From cf5388307a2b4faab4b11d732b61c85741be6169 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 4 Jul 2012 15:42:48 +0200 Subject: Btrfs: fix buffer leak in btrfs_next_old_leaf When calling btrfs_next_old_leaf, we were leaking an extent buffer in the rare case of using the deadlock avoidance code needed for the tree mod log. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8206b390058..67fe46fdee6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5127,6 +5127,7 @@ again: * locked. To solve this situation, we give up * on our lock and cycle. */ + free_extent_buffer(next); btrfs_release_path(path); cond_resched(); goto again; -- cgit v1.2.3 From 097b8a7c9e48e2cb50fd0eb9315791921beaf484 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Jun 2012 11:08:04 +0200 Subject: Btrfs: join tree mod log code with the code holding back delayed refs We've got two mechanisms both required for reliable backref resolving (tree mod log and holding back delayed refs). You cannot make use of one without the other. So instead of requiring the user of this mechanism to setup both correctly, we join them into a single interface. Additionally, we stop inserting non-blockers into fs_info->tree_mod_seq_list as we did before, which was of no value. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 30 ++---- fs/btrfs/backref.h | 3 +- fs/btrfs/ctree.c | 275 ++++++++++++++++++++++++++++++------------------- fs/btrfs/ctree.h | 31 +++--- fs/btrfs/delayed-ref.c | 44 ++++---- fs/btrfs/delayed-ref.h | 49 +-------- fs/btrfs/disk-io.c | 2 + fs/btrfs/extent-tree.c | 21 ++-- fs/btrfs/transaction.c | 4 - 9 files changed, 240 insertions(+), 219 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a383c18e74e..7d80ddd8f54 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist *refs, struct ulist *roots, - const u64 *extent_item_pos) + u64 time_seq, struct ulist *refs, + struct ulist *roots, const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; @@ -837,7 +836,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, delayed_ref_seq, + ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); if (ret) { @@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks) */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **leafs, + u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { struct ulist *tmp; @@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); @@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); @@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem = {}; struct seq_list tree_mod_seq_elem = {}; struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; - struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); @@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, tree_mod_seq_elem.seq, &refs, + tree_mod_seq_elem.seq, &refs, &extent_item_pos); if (ret) goto out; @@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - seq_elem.seq, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); - btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c18d8ac7b79..3a1ad3e2dcb 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,8 +58,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots); + u64 time_seq, struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 67fe46fdee6..bef68ab3220 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -321,7 +321,7 @@ struct tree_mod_root { struct tree_mod_elem { struct rb_node node; u64 index; /* shifted logical */ - struct seq_list elem; + u64 seq; enum mod_log_op op; /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ @@ -341,20 +341,50 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void -__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) { - elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + read_lock(&fs_info->tree_mod_log_lock); } -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) +{ + read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ + write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) { - elem->flags = 1; + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + u64 seq; + + tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); - __get_tree_mod_seq(fs_info, elem); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } + seq = btrfs_inc_tree_mod_seq(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); + + return seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - BUG_ON(!(elem->flags & 1)); spin_lock(&fs_info->tree_mod_seq_lock); list_del(&elem->list); + elem->seq = 0; list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (cur_elem->seq < min_seq) { if (seq_putting > cur_elem->seq) { /* * blocker with lower sequence number exists, we * cannot remove anything from the log */ - goto out; + spin_unlock(&fs_info->tree_mod_seq_lock); + return; } min_seq = cur_elem->seq; } } + spin_unlock(&fs_info->tree_mod_seq_lock); + + /* + * we removed the lowest blocker from the blocker list, so there may be + * more processible delayed refs. + */ + wake_up(&fs_info->tree_mod_seq_wait); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_write_lock(fs_info); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = container_of(node, struct tree_mod_elem, node); - if (tm->elem.seq > min_seq) + if (tm->seq > min_seq) continue; rb_erase(node, tm_root); - list_del(&tm->elem.list); kfree(tm); } - write_unlock(&fs_info->tree_mod_log_lock); -out: - spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); } /* @@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; - BUG_ON(!tm || !tm->elem.seq); + BUG_ON(!tm || !tm->seq); - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; while (*new) { @@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->index > tm->index) new = &((*new)->rb_right); - else if (cur->elem.seq < tm->elem.seq) + else if (cur->seq < tm->seq) new = &((*new)->rb_left); - else if (cur->elem.seq > tm->elem.seq) + else if (cur->seq > tm->seq) new = &((*new)->rb_right); else { kfree(tm); - ret = -EEXIST; - goto unlock; + return -EEXIST; } } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -unlock: - write_unlock(&fs_info->tree_mod_log_lock); - return ret; + return 0; } +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { smp_mb(); if (list_empty(&(fs_info)->tree_mod_seq_list)) return 1; - if (!eb) - return 0; - if (btrfs_header_level(eb) == 0) + if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list when no blocker exists. + */ + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } /* - * This allocates memory and gets a tree modification sequence number when - * needed. + * This allocates memory and gets a tree modification sequence number. * - * Returns 0 when no sequence number is needed, < 0 on error. - * Returns 1 when a sequence number was added. In this case, - * fs_info->tree_mod_seq_lock was acquired and must be released by the caller - * after inserting into the rb tree. + * Returns <0 on error. + * Returns >0 (the added sequence number) on success. */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { struct tree_mod_elem *tm; - int seq; - if (tree_mod_dont_log(fs_info, NULL)) - return 0; - - tm = *tm_ret = kzalloc(sizeof(*tm), flags); + /* + * once we switch from spin locks to something different, we should + * honor the flags parameter here. + */ + tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) return -ENOMEM; - tm->elem.flags = 0; - spin_lock(&fs_info->tree_mod_seq_lock); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list, because no blocker exists. items - * are removed from the list only when the existing blocker is - * removed from the list. - */ - kfree(tm); - seq = 0; - spin_unlock(&fs_info->tree_mod_seq_lock); - } else { - __get_tree_mod_seq(fs_info, &tm->elem); - seq = tm->elem.seq; - } - - return seq; + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + return tm->seq; } -static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static inline int +__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - struct tree_mod_elem *tm; int ret; + struct tree_mod_elem *tm; ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) + if (ret < 0) return ret; tm->index = eb->start >> PAGE_CACHE_SHIFT; @@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + int ret; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + + tree_mod_log_write_unlock(fs_info); return ret; } @@ -542,6 +582,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); } +static noinline int +tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op) +{ + return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); +} + static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, @@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, return 0; for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, MOD_LOG_KEY_REMOVE_WHILE_MOVING); BUG_ON(ret < 0); } ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_MOVE_KEYS; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } +static inline void +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ + int i; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key_locked(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, @@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm; int ret; + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_ROOT_REPLACE; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } @@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *found = NULL; u64 index = start >> PAGE_CACHE_SHIFT; - read_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_lock(fs_info); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, node = node->rb_left; } else if (cur->index > index) { node = node->rb_right; - } else if (cur->elem.seq < min_seq) { + } else if (cur->seq < min_seq) { node = node->rb_left; } else if (!smallest) { /* we want the node with the highest seq */ if (found) - BUG_ON(found->elem.seq > cur->elem.seq); + BUG_ON(found->seq > cur->seq); found = cur; node = node->rb_left; - } else if (cur->elem.seq > min_seq) { + } else if (cur->seq > min_seq) { /* we want the node with the smallest seq */ if (found) - BUG_ON(found->elem.seq < cur->elem.seq); + BUG_ON(found->seq < cur->seq); found = cur; node = node->rb_right; } else { @@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - read_unlock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_unlock(fs_info); return found; } @@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static inline void +static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) @@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (tree_mod_dont_log(fs_info, NULL)) return; - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { + tree_mod_log_write_unlock(fs_info); return; + } - /* speed this up by single seq for all operations? */ for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, - MOD_LOG_KEY_REMOVE); + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); - ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, - MOD_LOG_KEY_ADD); + ret = tree_mod_log_insert_key_locked(fs_info, dst, + i + dst_offset, + MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } + + tree_mod_log_write_unlock(fs_info); } static inline void @@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, BUG_ON(ret < 0); } -static inline void +static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int slot, int atomic) @@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static noinline void +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - int i; - int ret; - u32 nritems; - if (tree_mod_dont_log(fs_info, eb)) return; - nritems = btrfs_header_nritems(eb); - for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); - BUG_ON(ret < 0); - } + __tree_mod_log_free_eb(fs_info, eb); + + tree_mod_log_write_unlock(fs_info); } -static inline void +static noinline void tree_mod_log_set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node) { int ret; - tree_mod_log_free_eb(root->fs_info, root->node); ret = tree_mod_log_insert_root(root->fs_info, root->node, new_root_node, GFP_NOFS); BUG_ON(ret < 0); @@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - while (tm && tm->elem.seq >= time_seq) { + while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for * the modification. as we're going backwards, we do the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84ac723f58f..8f8dc46f44e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1030,6 +1030,13 @@ struct btrfs_block_group_cache { struct list_head cluster_list; }; +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; +}; + +/* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; @@ -1144,6 +1151,8 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic_t tree_mod_seq; struct list_head tree_mod_seq_list; + struct seq_list tree_mod_seq_elem; + wait_queue_head_t tree_mod_seq_wait; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -2798,6 +2807,16 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info); } +/* tree mod log functions from ctree.c */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic_inc_return(&fs_info->tree_mod_seq); +} + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -3157,18 +3176,6 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* delayed seq elem */ -struct seq_list { - struct list_head list; - u64 seq; - u32 flags; -}; - -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 13ae7b04790..21a75771763 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 seq) { struct seq_list *elem; - - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; - - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; + int ret = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is " + "%llu (%p)\n", seq, elem->seq, delayed_refs); + ret = 1; + } } - return 0; + + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, @@ -526,7 +530,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + seq = btrfs_inc_tree_mod_seq(fs_info); ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -585,7 +589,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + seq = btrfs_inc_tree_mod_seq(fs_info); ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -659,8 +663,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, level, action, for_cow); if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); return 0; @@ -708,8 +712,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, owner, offset, action, for_cow); if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); return 0; @@ -736,8 +740,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 413927fb995..2b5cb27f986 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. - */ - wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -195,33 +175,8 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} - -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} - -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 seq); /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8cc47103a32..19a39e10d6f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1944,6 +1944,8 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; + init_waitqueue_head(&fs_info->tree_mod_seq_wait); + /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1d36702ff..94ce79f76e5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2217,6 +2217,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int count = 0; int must_insert_reserved = 0; @@ -2255,7 +2256,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { /* * there are still refs with lower seq numbers in the * process of being added. Don't run this ref yet. @@ -2337,7 +2338,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } next: - do_chunk_alloc(trans, root->fs_info->extent_root, + do_chunk_alloc(trans, fs_info->extent_root, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); @@ -2347,18 +2348,19 @@ next: return count; } -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, +static void wait_for_more_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, unsigned long num_refs, struct list_head *first_seq) { spin_unlock(&delayed_refs->lock); pr_debug("waiting for more refs (num %ld, first %p)\n", num_refs, first_seq); - wait_event(delayed_refs->seq_wait, + wait_event(fs_info->tree_mod_seq_wait, num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); + fs_info->tree_mod_seq_list.next != first_seq); pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); + delayed_refs->num_entries, fs_info->tree_mod_seq_list.next); spin_lock(&delayed_refs->lock); } @@ -2403,6 +2405,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, again: consider_waiting = 0; spin_lock(&delayed_refs->lock); + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; @@ -2437,7 +2440,7 @@ again: num_refs = delayed_refs->num_entries; first_seq = root->fs_info->tree_mod_seq_list.next; } else { - wait_for_more_refs(delayed_refs, + wait_for_more_refs(root->fs_info, delayed_refs, num_refs, first_seq); /* * after waiting, things have changed. we @@ -5190,8 +5193,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) + wake_up(&root->fs_info->tree_mod_seq_wait); /* * we don't take a ref on the node because we're removing it from the diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183e..621c8dc48fb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -38,7 +38,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -126,7 +125,6 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; /* * although the tree mod log is per file system and not per transaction, @@ -145,10 +143,8 @@ loop: } atomic_set(&fs_info->tree_mod_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); -- cgit v1.2.3 From 630dc772ea51bca3ec6fac609f450cbe0cafd1d6 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 11:06:07 +0200 Subject: Btrfs: qgroup on-disk format Not all features are in use by the current version and thus may change in the future. Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f8dc46f44e..33088b0dbf3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -91,6 +91,9 @@ struct btrfs_ordered_sum; /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -883,6 +886,72 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * SCANNING is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 scan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +/* flags definition for qgroup limits */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + struct btrfs_space_info { u64 flags; @@ -1534,6 +1603,30 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + #define BTRFS_BALANCE_ITEM_KEY 248 /* @@ -2474,6 +2567,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, + scan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; -- cgit v1.2.3 From 2f38b3e1900634e64a186873b3388b1bf85dabc0 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 11:18:10 +0200 Subject: Btrfs: add helper for tree enumeration Often no exact match is wanted but just the next lower or higher item. There's a lot of duplicated code throughout btrfs to deal with the corner cases. This patch adds a helper function that can facilitate searching. Signed-off-by: Arne Jansen --- fs/btrfs/ctree.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 3 +++ 2 files changed, 75 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bef68ab3220..fb21431fe4e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2788,6 +2788,78 @@ done: return ret; } +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + /* we're sitting on an invalid slot */ + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } + --p->slots[0]; + } + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 33088b0dbf3..27cf995564e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2856,6 +2856,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, -- cgit v1.2.3 From d13603ef6e14a12cd65a6975e8117c0fea7c7ddf Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 11:40:09 +0200 Subject: Btrfs: check the root passed to btrfs_end_transaction This patch only add a consistancy check to validate that the same root is passed to start_transaction and end_transaction. Subvolume quota depends on this. Signed-off-by: Arne Jansen --- fs/btrfs/transaction.c | 6 ++++++ fs/btrfs/transaction.h | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 621c8dc48fb..23cbda0685b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -345,6 +345,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; h->block_rsv = NULL; @@ -511,6 +512,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + /* + * the same root has to be passed to start_transaction and + * end_transaction. Subvolume quota depends on this. + */ + WARN_ON(trans->root != root); while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fe27379e368..010729446e1 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,12 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; int aborted; + /* + * this root is only needed to validate that the root passed to + * start_transaction is the same as the one passed to end_transaction. + * Subvolume quota depends on this + */ + struct btrfs_root *root; }; struct btrfs_pending_snapshot { -- cgit v1.2.3 From 20897f5c86b9d2b77baea1d48eda7fa4ac217279 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 12:44:20 +0200 Subject: Btrfs: added helper to create new trees This creates a brand new tree. Will be used to create the quota tree. Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/disk-io.h | 6 +++++ 2 files changed, 83 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19a39e10d6f..6fc243eccff 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1225,6 +1225,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) return root; } +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + int ret = 0; + u64 bytenr; + + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + bytenr = leaf->start; + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + root->node = leaf; + + write_extent_buffer(leaf, fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + root->commit_root = btrfs_root_node(root); + root->track_dirty = 1; + + + root->root_item.flags = 0; + root->root_item.byte_limit = 0; + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + root->root_item.drop_level = 0; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + btrfs_tree_unlock(leaf); + +fail: + if (ret) + return ERR_PTR(ret); + + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -3260,7 +3336,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -static int btree_lock_page_hook(struct page *page, void *data, +int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 05b3fab39f7..95e147eea23 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid); +int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); -- cgit v1.2.3 From 416ac51da90e98daaac17e1f359a6c5591f7f5bd Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 12:56:09 +0200 Subject: Btrfs: qgroup state and initialization Add state to fs_info. Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 24 ++++++++++++++++++++++++ fs/btrfs/disk-io.c | 7 +++++++ 2 files changed, 31 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27cf995564e..a5269d4a164 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1120,6 +1120,7 @@ struct btrfs_fs_info { struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; + struct btrfs_root *quota_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1374,6 +1375,29 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif + /* + * quota information + */ + unsigned int quota_enabled:1; + + /* + * quota_enabled only changes state after a commit. This holds the + * next state. + */ + unsigned int pending_quota_state:1; + + /* is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* holds configuration and tracking. Protected by qgroup_lock */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* list of dirty qgroups to be written at next commit */ + struct list_head dirty_qgroups; + + /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + u64 qgroup_seq; /* filesystem state */ u64 fs_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6fc243eccff..eca05497442 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2110,6 +2110,13 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + spin_lock_init(&fs_info->qgroup_lock); + fs_info->qgroup_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); -- cgit v1.2.3 From 709c0486b9fe9586736b108b7233bbce0300cfa5 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 12 Sep 2011 12:22:57 +0200 Subject: Btrfs: Test code to change the order of delayed-ref processing Normally delayed refs get processed in ascending bytenr order. This correlates in most cases to the order added. To expose dependencies on this order, we start to process the tree in the middle instead of the beginning. This code is only effective when SCRAMBLE_DELAYED_REFS is defined. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 94ce79f76e5..b13f1fbc373 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,8 @@ #include "locking.h" #include "free-space-cache.h" +#undef SCRAMBLE_DELAYED_REFS + /* * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk @@ -2364,6 +2366,49 @@ static void wait_for_more_refs(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); } +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2406,6 +2451,10 @@ again: consider_waiting = 0; spin_lock(&delayed_refs->lock); +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; -- cgit v1.2.3 From bed92eae26ccf280d1a2168b7509447b56675a27 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 28 Jun 2012 18:03:02 +0200 Subject: Btrfs: qgroup implementation and prototypes Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 46 ++ fs/btrfs/extent-tree.c | 34 ++ fs/btrfs/ioctl.h | 24 + fs/btrfs/qgroup.c | 1571 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 2 + fs/btrfs/transaction.h | 3 + 7 files changed, 1681 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/qgroup.c (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4fa2befae..0bc4d3a10a5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o ulist.o qgroup.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a5269d4a164..ccba9b684c9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2830,6 +2830,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3339,6 +3341,50 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); +/* qgroup.c */ +struct qgroup_update { + struct list_head list; + struct btrfs_delayed_ref_node *node; + struct btrfs_delayed_extent_op *extent_op; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b13f1fbc373..1a63b830846 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2409,6 +2409,40 @@ static u64 find_middle(struct rb_root *root) } #endif +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct qgroup_update *qgroup_update; + int ret = 0; + + if (list_empty(&trans->qgroup_ref_list) != + !trans->delayed_ref_elem.seq) { + /* list without seq or seq without list */ + printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + list_empty(&trans->qgroup_ref_list) ? "" : " not", + trans->delayed_ref_elem.seq); + BUG(); + } + + if (!trans->delayed_ref_elem.seq) + return 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + qgroup_update = list_first_entry(&trans->qgroup_ref_list, + struct qgroup_update, list); + list_del(&qgroup_update->list); + if (!ret) + ret = btrfs_qgroup_account_ref( + trans, fs_info, qgroup_update->node, + qgroup_update->extent_op); + kfree(qgroup_update); + } + + btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e440aa653c3..a8a2230f4c5 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -35,6 +35,30 @@ struct btrfs_ioctl_vol_args { #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 +#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) + +struct btrfs_qgroup_limit { + __u64 flags; + __u64 max_rfer; + __u64 max_excl; + __u64 rsv_rfer; + __u64 rsv_excl; +}; + +struct btrfs_qgroup_inherit { + __u64 flags; + __u64 num_qgroups; + __u64 num_ref_copies; + __u64 num_excl_copies; + struct btrfs_qgroup_limit lim; + __u64 qgroups[0]; +}; + +struct btrfs_ioctl_qgroup_limit_args { + __u64 qgroupid; + struct btrfs_qgroup_limit lim; +}; + #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { __s64 fd; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c new file mode 100644 index 00000000000..bc424ae5a81 --- /dev/null +++ b/fs/btrfs/qgroup.c @@ -0,0 +1,1571 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "ulist.h" +#include "ioctl.h" +#include "backref.h" + +/* TODO XXX FIXME + * - subvol delete -> delete when ref goes to 0? delete limits also? + * - reorganize keys + * - compressed + * - sync + * - rescan + * - copy also limits on subvol creation + * - limit + * - caches fuer ulists + * - performance benchmarks + * - check all ioctl parameters + */ + +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + */ + u64 tag; + u64 refcnt; +}; + +/* + * glue structure to represent the relations between qgroups. + */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node *n = fs_info->qgroup_tree.rb_node; + struct btrfs_qgroup *qgroup; + + while (n) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + if (qgroup->qgroupid < qgroupid) + n = n->rb_left; + else if (qgroup->qgroupid > qgroupid) + n = n->rb_right; + else + return qgroup; + } + return NULL; +} + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node **p = &fs_info->qgroup_tree.rb_node; + struct rb_node *parent = NULL; + struct btrfs_qgroup *qgroup; + + while (*p) { + parent = *p; + qgroup = rb_entry(parent, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < qgroupid) + p = &(*p)->rb_left; + else if (qgroup->qgroupid > qgroupid) + p = &(*p)->rb_right; + else + return qgroup; + } + + qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); + if (!qgroup) + return ERR_PTR(-ENOMEM); + + qgroup->qgroupid = qgroupid; + INIT_LIST_HEAD(&qgroup->groups); + INIT_LIST_HEAD(&qgroup->members); + INIT_LIST_HEAD(&qgroup->dirty); + + rb_link_node(&qgroup->node, parent, p); + rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + + return qgroup; +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + struct btrfs_qgroup_list *list; + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + list_del(&qgroup->dirty); + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list = kzalloc(sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + list->group = parent; + list->member = member; + list_add_tail(&list->next_group, &member->groups); + list_add_tail(&list->next_member, &parent->members); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + return 0; + } + } + return -ENOENT; +} + +/* + * The full config is read in one go, only called from open_ctree() + * It doesn't use any locking, as at this point we're still single-threaded + */ +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path = NULL; + struct extent_buffer *l; + int slot; + int ret = 0; + u64 flags = 0; + + if (!fs_info->quota_enabled) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* default this to quota off, in case no status key is found */ + fs_info->qgroup_flags = 0; + + /* + * pass 1: read status, all qgroup infos and limits + */ + key.objectid = 0; + key.type = 0; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); + if (ret) + goto out; + + while (1) { + struct btrfs_qgroup *qgroup; + + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { + struct btrfs_qgroup_status_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_status_item); + + if (btrfs_qgroup_status_version(l, ptr) != + BTRFS_QGROUP_STATUS_VERSION) { + printk(KERN_ERR + "btrfs: old qgroup version, quota disabled\n"); + goto out; + } + if (btrfs_qgroup_status_generation(l, ptr) != + fs_info->generation) { + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + printk(KERN_ERR + "btrfs: qgroup generation mismatch, " + "marked as inconsistent\n"); + } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, + ptr); + /* FIXME read scan element */ + goto next1; + } + + if (found_key.type != BTRFS_QGROUP_INFO_KEY && + found_key.type != BTRFS_QGROUP_LIMIT_KEY) + goto next1; + + qgroup = find_qgroup_rb(fs_info, found_key.offset); + if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { + printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + if (!qgroup) { + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + } + switch (found_key.type) { + case BTRFS_QGROUP_INFO_KEY: { + struct btrfs_qgroup_info_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_info_item); + qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); + qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); + qgroup->excl = btrfs_qgroup_info_excl(l, ptr); + qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); + /* generation currently unused */ + break; + } + case BTRFS_QGROUP_LIMIT_KEY: { + struct btrfs_qgroup_limit_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_limit_item); + qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); + qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); + qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); + qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); + qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); + break; + } + } +next1: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } + btrfs_release_path(path); + + /* + * pass 2: read all qgroup relations + */ + key.objectid = 0; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); + if (ret) + goto out; + while (1) { + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type != BTRFS_QGROUP_RELATION_KEY) + goto next2; + + if (found_key.objectid > found_key.offset) { + /* parent <- member, not needed to build config */ + /* FIXME should we omit the key completely? */ + goto next2; + } + + ret = add_relation_rb(fs_info, found_key.objectid, + found_key.offset); + if (ret) + goto out; +next2: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } +out: + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + } + btrfs_free_path(path); + + return ret < 0 ? ret : 0; +} + +/* + * This is only called from close_ctree() or open_ctree(), both in single- + * treaded paths. Clean up the in-memory structures. No locking needed. + */ +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; + + while ((n = rb_first(&fs_info->qgroup_tree))) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + rb_erase(n, &fs_info->qgroup_tree); + + WARN_ON(!list_empty(&qgroup->dirty)); + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, + next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, + next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); + } +} + +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); +out: + btrfs_free_path(path); + return ret; +} + +static int add_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_qgroup_info_item *qgroup_info; + struct btrfs_qgroup_limit_item *qgroup_limit; + struct extent_buffer *leaf; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); + + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_limit)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); + + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 qgroupid, + u64 flags, u64 max_rfer, u64 max_excl, + u64 rsv_rfer, u64 rsv_excl) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_limit_item *qgroup_limit; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_LIMIT_KEY; + key.offset = qgroupid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_limit = btrfs_item_ptr(l, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_info_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_info_item *qgroup_info; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_info = btrfs_item_ptr(l, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); + btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); + btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); + btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_status_item(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_status_item *ptr; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_generation(l, ptr, trans->transid); + /* XXX scan */ + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called with qgroup_lock held + */ +static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (!root) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = 0; + key.offset = 0; + key.type = 0; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + ret = 0; +out: + root->fs_info->pending_quota_state = 0; + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_path *path = NULL; + struct btrfs_qgroup_status_item *ptr; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret = 0; + + spin_lock(&fs_info->qgroup_lock); + if (fs_info->quota_root) { + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + spin_unlock(&fs_info->qgroup_lock); + + /* + * initially create the quota tree + */ + quota_root = btrfs_create_tree(trans, fs_info, + BTRFS_QUOTA_TREE_OBJECTID); + if (IS_ERR(quota_root)) { + ret = PTR_ERR(quota_root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*ptr)); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); + btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_scan(leaf, ptr, 0); + + btrfs_mark_buffer_dirty(leaf); + + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *quota_root; + int ret = 0; + + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + quota_root = fs_info->quota_root; + fs_info->quota_root = NULL; + btrfs_free_qgroup_config(fs_info); + spin_unlock(&fs_info->qgroup_lock); + + if (!quota_root) + return -EINVAL; + + ret = btrfs_clean_quota_tree(trans, quota_root); + if (ret) + goto out; + + ret = btrfs_del_root(trans, tree_root, "a_root->root_key); + if (ret) + goto out; + + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); + clean_tree_block(trans, tree_root, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); +out: + return ret; +} + +int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) +{ + /* FIXME */ + return 0; +} + +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = add_qgroup_relation_item(trans, quota_root, src, dst); + if (ret) + return ret; + + ret = add_qgroup_relation_item(trans, quota_root, dst, src); + if (ret) { + del_qgroup_relation_item(trans, quota_root, src, dst); + return ret; + } + + spin_lock(&fs_info->qgroup_lock); + ret = add_relation_rb(quota_root->fs_info, src, dst); + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + int ret = 0; + int err; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = del_qgroup_relation_item(trans, quota_root, src, dst); + err = del_qgroup_relation_item(trans, quota_root, dst, src); + if (err && !ret) + ret = err; + + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = add_qgroup_item(trans, quota_root, qgroupid); + + spin_lock(&fs_info->qgroup_lock); + qgroup = add_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + if (IS_ERR(qgroup)) + ret = PTR_ERR(qgroup); + + return ret; +} + +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_root *quota_root; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = del_qgroup_item(trans, quota_root, qgroupid); + + spin_lock(&fs_info->qgroup_lock); + del_qgroup_rb(quota_root->fs_info, qgroupid); + + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + if (!quota_root) + return -EINVAL; + + ret = update_qgroup_limit_item(trans, quota_root, qgroupid, + limit->flags, limit->max_rfer, + limit->max_excl, limit->rsv_rfer, + limit->rsv_excl); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + printk(KERN_INFO "unable to update quota limit for %llu\n", + (unsigned long long)qgroupid); + } + + spin_lock(&fs_info->qgroup_lock); + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto unlock; + } + qgroup->lim_flags = limit->flags; + qgroup->max_rfer = limit->max_rfer; + qgroup->max_excl = limit->max_excl; + qgroup->rsv_rfer = limit->rsv_rfer; + qgroup->rsv_excl = limit->rsv_excl; + +unlock: + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); +} + +/* + * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts + * the modification into a list that's later used by btrfs_end_transaction to + * pass the recorded modifications on to btrfs_qgroup_account_ref. + */ +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct qgroup_update *u; + + BUG_ON(!trans->delayed_ref_elem.seq); + u = kmalloc(sizeof(*u), GFP_NOFS); + if (!u) + return -ENOMEM; + + u->node = node; + u->extent_op = extent_op; + list_add_tail(&u->list, &trans->qgroup_ref_list); + + return 0; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key ins; + struct btrfs_root *quota_root; + u64 ref_root; + struct btrfs_qgroup *qgroup; + struct ulist_node *unode; + struct ulist *roots = NULL; + struct ulist *tmp = NULL; + struct ulist_iterator uiter; + u64 seq; + int ret = 0; + int sgn; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) { + struct btrfs_delayed_tree_ref *ref; + ref = btrfs_delayed_node_to_tree_ref(node); + ref_root = ref->root; + } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + ref_root = ref->root; + } else { + BUG(); + } + + if (!is_fstree(ref_root)) { + /* + * non-fs-trees are not being accounted + */ + return 0; + } + + switch (node->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + case BTRFS_UPDATE_DELAYED_HEAD: + return 0; + default: + BUG(); + } + + /* + * the delayed ref sequence number we pass depends on the direction of + * the operation. for add operations, we pass (node->seq - 1) to skip + * the delayed ref's current sequence number, because we need the state + * of the tree before the add operation. for delete operations, we pass + * (node->seq) to include the delayed ref's current sequence number, + * because we need the state of the tree after the delete operation. + */ + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, + sgn > 0 ? node->seq - 1 : node->seq, &roots); + if (ret < 0) + goto out; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto unlock; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto unlock; + + /* + * step 1: for each old ref, visit all nodes once and inc refcnt + */ + tmp = ulist_alloc(GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto unlock; + } + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + /* XXX id not needed */ + ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)tmp_unode->aux; + if (qg->refcnt < seq) + qg->refcnt = seq + 1; + else + ++qg->refcnt; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, + GFP_ATOMIC); + } + } + } + + /* + * step 2: walk from the new root + */ + ulist_reinit(tmp); + ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + if (qg->refcnt < seq) { + /* not visited by step 1 */ + qg->rfer += sgn * node->num_bytes; + qg->rfer_cmpr += sgn * node->num_bytes; + if (roots->nnodes == 0) { + qg->excl += sgn * node->num_bytes; + qg->excl_cmpr += sgn * node->num_bytes; + } + qgroup_dirty(fs_info, qg); + } + WARN_ON(qg->tag >= seq); + qg->tag = seq; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + + /* + * step 3: walk again from old refs + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct btrfs_qgroup *qg; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)tmp_unode->aux; + if (qg->tag == seq) + continue; + + if (qg->refcnt - seq == roots->nnodes) { + qg->excl -= sgn * node->num_bytes; + qg->excl_cmpr -= sgn * node->num_bytes; + qgroup_dirty(fs_info, qg); + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, + GFP_ATOMIC); + } + } + } + ret = 0; +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + ulist_free(roots); + ulist_free(tmp); + + return ret; +} + +/* + * called from commit_transaction. Writes all changed qgroups to disk. + */ +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + int ret = 0; + + if (!quota_root) + goto out; + + fs_info->quota_enabled = fs_info->pending_quota_state; + + spin_lock(&fs_info->qgroup_lock); + while (!list_empty(&fs_info->dirty_qgroups)) { + struct btrfs_qgroup *qgroup; + qgroup = list_first_entry(&fs_info->dirty_qgroups, + struct btrfs_qgroup, dirty); + list_del_init(&qgroup->dirty); + spin_unlock(&fs_info->qgroup_lock); + ret = update_qgroup_info_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + spin_lock(&fs_info->qgroup_lock); + } + if (fs_info->quota_enabled) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; + else + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_status_item(trans, fs_info, quota_root); + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + +out: + + return ret; +} + +/* + * copy the acounting information between qgroups. This is necessary when a + * snapshot or a subvolume is created + */ +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit) +{ + int ret = 0; + int i; + u64 *i_qgroups; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *srcgroup; + struct btrfs_qgroup *dstgroup; + u32 level_size = 0; + + if (!fs_info->quota_enabled) + return 0; + + if (!quota_root) + return -EINVAL; + + /* + * create a tracking group for the subvol itself + */ + ret = add_qgroup_item(trans, quota_root, objectid); + if (ret) + goto out; + + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + ret = update_qgroup_limit_item(trans, quota_root, objectid, + inherit->lim.flags, + inherit->lim.max_rfer, + inherit->lim.max_excl, + inherit->lim.rsv_rfer, + inherit->lim.rsv_excl); + if (ret) + goto out; + } + + if (srcid) { + struct btrfs_root *srcroot; + struct btrfs_key srckey; + int srcroot_level; + + srckey.objectid = srcid; + srckey.type = BTRFS_ROOT_ITEM_KEY; + srckey.offset = (u64)-1; + srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey); + if (IS_ERR(srcroot)) { + ret = PTR_ERR(srcroot); + goto out; + } + + rcu_read_lock(); + srcroot_level = btrfs_header_level(srcroot->node); + level_size = btrfs_level_size(srcroot, srcroot_level); + rcu_read_unlock(); + } + + /* + * add qgroup to all inherited groups + */ + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_qgroup_relation_item(trans, quota_root, + objectid, *i_qgroups); + if (ret) + goto out; + ret = add_qgroup_relation_item(trans, quota_root, + *i_qgroups, objectid); + if (ret) + goto out; + ++i_qgroups; + } + } + + + spin_lock(&fs_info->qgroup_lock); + + dstgroup = add_qgroup_rb(fs_info, objectid); + if (!dstgroup) + goto unlock; + + if (srcid) { + srcgroup = find_qgroup_rb(fs_info, srcid); + if (!srcgroup) + goto unlock; + dstgroup->rfer = srcgroup->rfer - level_size; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + srcgroup->excl = level_size; + srcgroup->excl_cmpr = level_size; + qgroup_dirty(fs_info, dstgroup); + qgroup_dirty(fs_info, srcgroup); + } + + if (!inherit) + goto unlock; + + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + ++i_qgroups; + } + + for (i = 0; i < inherit->num_ref_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->rfer = src->rfer - level_size; + dst->rfer_cmpr = src->rfer_cmpr - level_size; + i_qgroups += 2; + } + for (i = 0; i < inherit->num_excl_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->excl = src->excl + level_size; + dst->excl_cmpr = src->excl_cmpr + level_size; + i_qgroups += 2; + } + +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + return ret; +} + +/* + * reserve some space for a qgroup and all its parents. The reservation takes + * place with start_transaction or dealloc_reserve, similar to ENOSPC + * accounting. If not enough space is available, EDQUOT is returned. + * We assume that the requested space is new for all qgroups. + */ +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 ref_root = root->root_key.objectid; + int ret = 0; + struct ulist *ulist = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + + if (!is_fstree(ref_root)) + return 0; + + if (num_bytes == 0) + return 0; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + /* + * in a first step, we check all affected qgroups if any limits would + * be exceeded + */ + ulist = ulist_alloc(GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qg->reserved + qg->rfer + num_bytes > + qg->max_rfer) + ret = -EDQUOT; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qg->reserved + qg->excl + num_bytes > + qg->max_excl) + ret = -EDQUOT; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(ulist, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + if (ret) + goto out; + + /* + * no limits exceeded, now record the reservation into all qgroups + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + + qg = (struct btrfs_qgroup *)unode->aux; + + qg->reserved += num_bytes; + } + +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(ulist); + + return ret; +} + +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *ulist = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + u64 ref_root = root->root_key.objectid; + + if (!is_fstree(ref_root)) + return; + + if (num_bytes == 0) + return; + + spin_lock(&fs_info->qgroup_lock); + + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + ulist = ulist_alloc(GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + + qg->reserved -= num_bytes; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(ulist, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(ulist); +} + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) +{ + if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) + return; + printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", + trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", + trans->delayed_ref_elem.seq); + BUG(); +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 23cbda0685b..0d6c8816845 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -351,6 +351,8 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; + h->delayed_ref_elem.seq = 0; + INIT_LIST_HEAD(&h->qgroup_ref_list); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 010729446e1..16ba00842c3 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -20,6 +20,7 @@ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" #include "delayed-ref.h" +#include "ctree.h" struct btrfs_transaction { u64 transid; @@ -63,6 +64,8 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct seq_list delayed_ref_elem; + struct list_head qgroup_ref_list; }; struct btrfs_pending_snapshot { -- cgit v1.2.3 From edf39272db4810282360f7362d43ade1d524c913 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 28 Jun 2012 18:04:55 +0200 Subject: Btrfs: call the qgroup accounting functions Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/transaction.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1a63b830846..c08337a83ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2479,6 +2479,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: @@ -2588,6 +2590,7 @@ again: } out: spin_unlock(&delayed_refs->lock); + assert_qgroups_uptodate(trans); return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0d6c8816845..d20d2e24f8d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -512,6 +512,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + /* + * do the qgroup accounting as early as possible + */ + err = btrfs_delayed_refs_qgroup_accounting(trans, info); + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; /* @@ -571,6 +576,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { err = -EIO; } + assert_qgroups_uptodate(trans); memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -1355,6 +1361,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; + /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + /* * rename don't use btrfs_join_transaction, so, once we * set the transaction to blocked above, we aren't going @@ -1467,6 +1480,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->chunk_root->node); switch_commit_root(root->fs_info->chunk_root); + assert_qgroups_uptodate(trans); update_super_roots(root); if (!root->fs_info->log_root_recovering) { -- cgit v1.2.3 From bcef60f249034f69e89e544461cbfecb68975595 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 15:23:30 +0200 Subject: Btrfs: quota tree support and startup Init the quota tree along with the others on open_ctree and close_ctree. Add the quota tree to the list of well known trees in btrfs_read_fs_root_no_name. Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ccba9b684c9..2ba03b96fbe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2967,6 +2967,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info->quota_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kfree(fs_info); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eca05497442..87d9391c057 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1472,6 +1472,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; + if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + return fs_info->quota_root ? fs_info->quota_root : + ERR_PTR(-ENOENT); again: spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, @@ -1899,6 +1902,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_extent_buffer(info->extent_root->commit_root); free_extent_buffer(info->csum_root->node); free_extent_buffer(info->csum_root->commit_root); + if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); + } info->tree_root->node = NULL; info->tree_root->commit_root = NULL; @@ -1908,6 +1915,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) info->extent_root->commit_root = NULL; info->csum_root->node = NULL; info->csum_root->commit_root = NULL; + if (info->quota_root) { + info->quota_root->node = NULL; + info->quota_root->commit_root = NULL; + } if (chunk_root) { free_extent_buffer(info->chunk_root->node); @@ -1938,6 +1949,7 @@ int open_ctree(struct super_block *sb, struct btrfs_root *csum_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; + struct btrfs_root *quota_root; struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; @@ -1949,9 +1961,10 @@ int open_ctree(struct super_block *sb, csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); + quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { + !chunk_root || !dev_root || !quota_root) { err = -ENOMEM; goto fail; } @@ -2441,6 +2454,17 @@ retry_root_backup: goto recovery_tree_root; csum_root->track_dirty = 1; + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_QUOTA_TREE_OBJECTID, quota_root); + if (ret) { + kfree(quota_root); + quota_root = fs_info->quota_root = NULL; + } else { + quota_root->track_dirty = 1; + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + } + fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2500,6 +2524,9 @@ retry_root_backup: " integrity check module %s\n", sb->s_id); } #endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && @@ -2510,7 +2537,7 @@ retry_root_backup: printk(KERN_WARNING "Btrfs log replay required " "on RO media\n"); err = -EIO; - goto fail_trans_kthread; + goto fail_qgroup; } blocksize = btrfs_level_size(tree_root, @@ -2519,7 +2546,7 @@ retry_root_backup: log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; - goto fail_trans_kthread; + goto fail_qgroup; } __setup_root(nodesize, leafsize, sectorsize, stripesize, @@ -2559,7 +2586,7 @@ retry_root_backup: printk(KERN_WARNING "btrfs: failed to recover relocation\n"); err = -EINVAL; - goto fail_trans_kthread; + goto fail_qgroup; } } @@ -2569,10 +2596,10 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) - goto fail_trans_kthread; + goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) @@ -2596,6 +2623,8 @@ retry_root_backup: return 0; +fail_qgroup: + btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); fail_cleaner: @@ -3194,6 +3223,8 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); + btrfs_free_qgroup_config(root->fs_info); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -3213,6 +3244,10 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->dev_root->commit_root); free_extent_buffer(fs_info->csum_root->node); free_extent_buffer(fs_info->csum_root->commit_root); + if (fs_info->quota_root) { + free_extent_buffer(fs_info->quota_root->node); + free_extent_buffer(fs_info->quota_root->commit_root); + } btrfs_free_block_groups(fs_info); -- cgit v1.2.3 From 546adb0d817c34dc2be3a7cb5bba8771f837a562 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 14 Jun 2012 16:37:44 +0200 Subject: Btrfs: hooks for qgroup to record delayed refs Hooks into qgroup code to record refs and into transaction commit. This is the main entry point for qgroup. Basically every change in extent backrefs got accounted to the appropriate qgroups. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 16 ++++++++++------ fs/btrfs/delayed-ref.h | 19 +++++++++++++++++++ fs/btrfs/transaction.c | 7 +++++++ 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 21a75771763..da7419ed01b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -529,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = btrfs_inc_tree_mod_seq(fs_info); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -588,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = btrfs_inc_tree_mod_seq(fs_info); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -662,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!is_fstree(ref_root) && + if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&fs_info->tree_mod_seq_wait)) wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -711,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!is_fstree(ref_root) && + if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&fs_info->tree_mod_seq_wait)) wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 2b5cb27f986..0d7c90c366b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -179,6 +179,25 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq); +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d20d2e24f8d..21c768cb443 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -795,6 +795,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_dev_stats(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_qgroups(trans, root->fs_info); + BUG_ON(ret); + + /* run_qgroups might have added some more refs */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); -- cgit v1.2.3 From c556723794b3487a79de1ecd6354975b1389f5ff Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 15:44:05 +0200 Subject: Btrfs: hooks to reserve qgroup space Like block reserves, reserve a small piece of space on each transaction start and for delalloc. These are the hooks that can actually return EDQUOT to the user. The amount of space reserved is tracked in the transaction handle. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 12 ++++++++++++ fs/btrfs/transaction.c | 16 ++++++++++++++++ fs/btrfs/transaction.h | 1 + 3 files changed, 29 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c08337a83ac..2ce16f97730 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4565,6 +4565,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + if (ret) + return ret; + } + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; @@ -4643,6 +4650,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 21c768cb443..f1e29fbd531 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -295,6 +295,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_transaction *cur_trans; u64 num_bytes = 0; int ret; + u64 qgroup_reserved = 0; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); @@ -313,6 +314,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { + if (root->fs_info->quota_enabled && + is_fstree(root->root_key.objectid)) { + qgroup_reserved = num_items * root->leafsize; + ret = btrfs_qgroup_reserve(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); + } + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, @@ -351,6 +360,7 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; + h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; INIT_LIST_HEAD(&h->qgroup_ref_list); @@ -524,6 +534,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, * end_transaction. Subvolume quota depends on this. */ WARN_ON(trans->root != root); + + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 16ba00842c3..2759e0572c5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -50,6 +50,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; -- cgit v1.2.3 From 5d13a37bd5327220e13329943d1228acfbe5934a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 15:53:51 +0200 Subject: Btrfs: add qgroup ioctls Ioctls to control the qgroup feature like adding and removing qgroups and assigning qgroups. Signed-off-by: Arne Jansen --- fs/btrfs/ioctl.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 27 ++++++++ 2 files changed, 212 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e92e576300..55a7283a9e1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3390,6 +3390,183 @@ out: return ret; } +static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_quota_ctl_args *sa; + struct btrfs_trans_handle *trans = NULL; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + } + + switch (sa->cmd) { + case BTRFS_QUOTA_CTL_ENABLE: + ret = btrfs_quota_enable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_DISABLE: + ret = btrfs_quota_disable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_RESCAN: + ret = btrfs_quota_rescan(root->fs_info); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (trans) { + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + } + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->assign) { + ret = btrfs_add_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } else { + ret = btrfs_del_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_create_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->create) { + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, + NULL); + } else { + ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_limit_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + u64 qgroupid; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + qgroupid = sa->qgroupid; + if (!qgroupid) { + /* take the current subvol as qgroup */ + qgroupid = root->root_key.objectid; + } + + /* FIXME: check if the IDs really exist */ + ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3476,6 +3653,14 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_dev_stats(root, argp, 0); case BTRFS_IOC_GET_AND_RESET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp, 1); + case BTRFS_IOC_QUOTA_CTL: + return btrfs_ioctl_quota_ctl(root, argp); + case BTRFS_IOC_QGROUP_ASSIGN: + return btrfs_ioctl_qgroup_assign(root, argp); + case BTRFS_IOC_QGROUP_CREATE: + return btrfs_ioctl_qgroup_create(root, argp); + case BTRFS_IOC_QGROUP_LIMIT: + return btrfs_ioctl_qgroup_limit(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index a8a2230f4c5..9dd50c4656b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -319,6 +319,25 @@ struct btrfs_ioctl_get_dev_stats { __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ }; +#define BTRFS_QUOTA_CTL_ENABLE 1 +#define BTRFS_QUOTA_CTL_DISABLE 2 +#define BTRFS_QUOTA_CTL_RESCAN 3 +struct btrfs_ioctl_quota_ctl_args { + __u64 cmd; + __u64 status; +}; + +struct btrfs_ioctl_qgroup_assign_args { + __u64 assign; + __u64 src; + __u64 dst; +}; + +struct btrfs_ioctl_qgroup_create_args { + __u64 create; + __u64 qgroupid; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -388,4 +407,12 @@ struct btrfs_ioctl_get_dev_stats { #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ + struct btrfs_ioctl_quota_ctl_args) +#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ + struct btrfs_ioctl_qgroup_assign_args) +#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ + struct btrfs_ioctl_qgroup_create_args) +#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ + struct btrfs_ioctl_qgroup_limit_args) #endif -- cgit v1.2.3 From 6f72c7e20dbaea55f04546de69586c84a3654503 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 15:58:21 +0200 Subject: Btrfs: add qgroup inheritance When creating a subvolume or snapshot, it is necessary to initialize the qgroup account with a copy of some other (tracking) qgroup. This patch adds parameters to the ioctls to pass the information from which qgroup to inherit. Signed-off-by: Arne Jansen --- fs/btrfs/ioctl.c | 59 +++++++++++++++++++++++++++++++++++--------------- fs/btrfs/ioctl.h | 11 +++++++++- fs/btrfs/transaction.c | 8 +++++++ fs/btrfs/transaction.h | 1 + 4 files changed, 61 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 55a7283a9e1..1dffd0adf97 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -336,7 +336,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, - u64 *async_transid) + u64 *async_transid, + struct btrfs_qgroup_inherit **inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -368,6 +369,11 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, + inherit ? *inherit : NULL); + if (ret) + goto fail; + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { @@ -484,7 +490,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly) + bool readonly, struct btrfs_qgroup_inherit **inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -502,6 +508,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; + if (inherit) { + pending_snapshot->inherit = *inherit; + *inherit = NULL; /* take responsibility to free it */ + } trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -635,7 +645,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid, bool readonly) + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit **inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -666,11 +677,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen, async_transid, readonly); + error = create_snapshot(snap_src, dentry, name, namelen, + async_transid, readonly, inherit); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid); + name, namelen, async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); @@ -1379,11 +1390,9 @@ out: } static noinline int btrfs_ioctl_snap_create_transid(struct file *file, - char *name, - unsigned long fd, - int subvol, - u64 *transid, - bool readonly) + char *name, unsigned long fd, int subvol, + u64 *transid, bool readonly, + struct btrfs_qgroup_inherit **inherit) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -1407,7 +1416,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly); + NULL, transid, readonly, inherit); } else { struct inode *src_inode; src_file = fget(fd); @@ -1426,7 +1435,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid, readonly); + transid, readonly, inherit); fput(src_file); } out: @@ -1446,7 +1455,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, - NULL, false); + NULL, false, NULL); kfree(vol_args); return ret; @@ -1460,6 +1469,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, u64 transid = 0; u64 *ptr = NULL; bool readonly = false; + struct btrfs_qgroup_inherit *inherit = NULL; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) @@ -1467,7 +1477,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | + BTRFS_SUBVOL_QGROUP_INHERIT)) { ret = -EOPNOTSUPP; goto out; } @@ -1476,10 +1487,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ptr = &transid; if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; + if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { + if (vol_args->size > PAGE_CACHE_SIZE) { + ret = -EINVAL; + goto out; + } + inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); + if (IS_ERR(inherit)) { + ret = PTR_ERR(inherit); + goto out; + } + } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - ptr, readonly); + vol_args->fd, subvol, ptr, + readonly, &inherit); if (ret == 0 && ptr && copy_to_user(arg + @@ -1488,6 +1510,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = -EFAULT; out: kfree(vol_args); + kfree(inherit); return ret; } @@ -3588,6 +3611,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SUBVOL_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_SUBVOL_GETFLAGS: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 9dd50c4656b..cdda57f1c24 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -32,6 +32,7 @@ struct btrfs_ioctl_vol_args { #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 @@ -64,7 +65,13 @@ struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; __u64 flags; - __u64 unused[4]; + union { + struct { + __u64 size; + struct btrfs_qgroup_inherit __user *qgroup_inherit; + }; + __u64 unused[4]; + }; char name[BTRFS_SUBVOL_NAME_MAX + 1]; }; @@ -382,6 +389,8 @@ struct btrfs_ioctl_qgroup_create_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ + struct btrfs_ioctl_vol_args_v2) #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f1e29fbd531..127283913a4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -990,6 +990,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } + ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, + objectid, pending->inherit); + kfree(pending->inherit); + if (ret) { + pending->error = ret; + goto fail; + } + key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2759e0572c5..cca315dcdfc 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -73,6 +73,7 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_root *root; struct btrfs_root *snap; + struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */ -- cgit v1.2.3 From 10983f2e8dc65d118371681548809109b570b63b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 11 Jul 2012 15:26:19 +0800 Subject: Btrfs: fix typo in convert_extent_bit It should be convert_extent_bit. Signed-off-by: Liu Bo Signed-off-by: Jiri Kosina --- fs/btrfs/extent_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9018a05036..97f6703fd49 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -933,7 +933,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, /** - * convert_extent - convert all bits in a given range from one bit to another + * convert_extent_bit - convert all bits in a given range from one bit to + * another * @tree: the io tree to search * @start: the start offset in bytes * @end: the end offset in bytes (inclusive) -- cgit v1.2.3 From b3d9b7a3c752dc4b6976a4ff7b8298887a5b734d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 9 Jun 2012 13:51:19 -0400 Subject: vfs: switch i_dentry/d_alias to hlist Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7d1921ac76..a101572f1ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6987,7 +6987,7 @@ void btrfs_destroy_inode(struct inode *inode) struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); -- cgit v1.2.3 From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 17:13:09 -0400 Subject: stop passing nameidata to ->lookup() Just the flags; only NFS cares even about that, but there are legitimate uses for such argument. And getting rid of that completely would require splitting ->lookup() into a couple of methods (at least), so let's leave that alone for now... Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a101572f1ce..e5f1f81b2d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4247,7 +4247,7 @@ static void btrfs_dentry_release(struct dentry *dentry) } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret; -- cgit v1.2.3 From ebfc3b49a7ac25920cb5be5445f602e51d2ea559 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 18:05:36 -0400 Subject: don't pass nameidata to ->create() boolean "does it have to be exclusive?" flag is passed instead; Local filesystem should just ignore it - the object is guaranteed not to be there yet. Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e5f1f81b2d6..fb8d671d00e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4893,7 +4893,7 @@ out_unlock: } static int btrfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; -- cgit v1.2.3 From 9249e17fe094d853d1ef7475dd559a2cc7e23d42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:37 +0100 Subject: VFS: Pass mount flags to sget() Pass mount flags to sget() so that it can use them in initialising a new superblock before the set function is called. They could also be passed to the compare function. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e23991574fd..b19d7556772 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1068,7 +1068,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -1082,7 +1083,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, -- cgit v1.2.3 From 11e62a8fabd003352e852e74e1b64a437fd908c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 19 Jul 2012 11:17:49 +0400 Subject: btrfs: switch btrfs_ioctl_balance() to mnt_want_write_file() Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e92e576300..1e9f6c019ad 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3268,7 +3268,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3338,7 +3338,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From e39e64ac0cdeca3798a6bf186f873be20e2f57b4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 23 Jul 2012 15:23:45 -0400 Subject: Btrfs: don't wait around for new log writers on an SSD Waiting on spindles improves performance, but ssds want all the IO as quickly as we can push it down. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183e..8c35847d0fe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1330,7 +1330,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); } - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) + if (!btrfs_test_opt(root, SSD) && + (now < cur_trans->start_time || now - cur_trans->start_time < 1)) should_grow = 1; do { -- cgit v1.2.3 From cbea5ac1ee03197354bd38caad3fcb798f185181 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 23 Jul 2012 15:25:05 -0400 Subject: Btrfs: reduce calls to wake_up on uncontended locks The btrfs locks were unconditionally calling wake_up as the locks were released. This lead to extra thrashing on the waitqueue, especially for locks that were dominated by readers. Signed-off-by: Chris Mason --- fs/btrfs/locking.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 272f911203f..a44eff07480 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -78,13 +78,15 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); - if (atomic_dec_and_test(&eb->blocking_writers)) + if (atomic_dec_and_test(&eb->blocking_writers) && + waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); } else if (rw == BTRFS_READ_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); - if (atomic_dec_and_test(&eb->blocking_readers)) + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); } return; @@ -199,7 +201,8 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); - if (atomic_dec_and_test(&eb->blocking_readers)) + if (atomic_dec_and_test(&eb->blocking_readers) && + waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); atomic_dec(&eb->read_locks); } @@ -247,8 +250,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); - smp_wmb(); - wake_up(&eb->write_lock_wq); + smp_mb(); + if (waitqueue_active(&eb->write_lock_wq)) + wake_up(&eb->write_lock_wq); } else { WARN_ON(atomic_read(&eb->spinning_writers) != 1); atomic_dec(&eb->spinning_writers); -- cgit v1.2.3 From c5c3c5f31e6af2d12b154251a7f23b7f4add6b1d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Apr 2012 14:42:44 -0400 Subject: Btrfs: remove ->dirty_inode We do all of our inode updating when we change it, and now that we do ->update_time we don't need ->dirty_inode for atime updates anymore, so just remove it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e23991574fd..ddc2efdda1a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1477,16 +1477,6 @@ static int btrfs_unfreeze(struct super_block *sb) return 0; } -static void btrfs_fs_dirty_inode(struct inode *inode, int flags) -{ - int ret; - - ret = btrfs_dirty_inode(inode); - if (ret) - printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu " - "error %d\n", btrfs_ino(inode), ret); -} - static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -1526,7 +1516,6 @@ static const struct super_operations btrfs_super_ops = { .show_options = btrfs_show_options, .show_devname = btrfs_show_devname, .write_inode = btrfs_write_inode, - .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .statfs = btrfs_statfs, -- cgit v1.2.3 From 063849eafda03edf6872a3728b4a98dcc86290c7 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Mon, 16 Apr 2012 15:27:51 +0200 Subject: Btrfs: allow mount -o remount,compress=no Btrfs allows to turn on compression on a mounted and used filesystem by issuing mount -o remount,compress=lzo. This patch allows to turn compression off again while the filesystem is mounted. As suggested by David Sterba if the compress-force option was set, it is implicitly cleared if compression is turned off. Tested-by: David Sterba Signed-off-by: Arnd Hannemann --- fs/btrfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ddc2efdda1a..88a2d2bb2d7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -396,15 +396,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) strcmp(args[0].from, "zlib") == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; + btrfs_set_opt(info->mount_opt, COMPRESS); } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; + btrfs_set_opt(info->mount_opt, COMPRESS); + } else if (strncmp(args[0].from, "no", 2) == 0) { + compress_type = "no"; + info->compress_type = BTRFS_COMPRESS_NONE; + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); + compress_force = false; } else { ret = -EINVAL; goto out; } - btrfs_set_opt(info->mount_opt, COMPRESS); if (compress_force) { btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); pr_info("btrfs: force %s compression\n", -- cgit v1.2.3 From 2bc5565286121d2a77ccd728eb3484dff2035b58 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Fri, 15 Jun 2012 09:49:33 +0200 Subject: Btrfs: don't update atime on RO subvolumes Before the update_time inode operation was indroduced, it was not possible to prevent updates of atime on RO subvolumes. VFS was only able to check for RO on the mount, but did not know anything about btrfs subvolumes. btrfs_update_time does now check if the root is RO and skip updating of times. Signed-off-by: Alexander Block --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7d1921ac76..fcc65802f36 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4518,6 +4518,11 @@ int btrfs_dirty_inode(struct inode *inode) static int btrfs_update_time(struct inode *inode, struct timespec *now, int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root)) + return -EROFS; + if (flags & S_VERSION) inode_inc_iversion(inode); if (flags & S_CTIME) -- cgit v1.2.3 From e4b50e14c8f72bcbae53809815d5df70d5aec174 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jun 2012 13:30:11 +0300 Subject: Btrfs: small naming cleanup in join_transaction() "root->fs_info" and "fs_info" are the same, but "fs_info" is prefered because it is shorter and that's what is used in the rest of the function. Signed-off-by: Dan Carpenter --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8c35847d0fe..cb2dfe29394 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -100,8 +100,8 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; - } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); + } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; } -- cgit v1.2.3 From a43a21113365e5a9b59efc411da715d910cca87c Mon Sep 17 00:00:00 2001 From: Andrew Mahone Date: Tue, 19 Jun 2012 21:08:32 -0400 Subject: btrfs: ignore unfragmented file checks in defrag when compression enabled - rebased Rebased on btrfs-next and retested. Inform should_defrag_range if BTRFS_DEFRAG_RANGE_COMPRESS is set. If so, skip checks for adjacent extents and extent size when deciding whether to defrag, as these can prevent an uncompressed and unfragmented file from being compressed as requested. Signed-off-by: Andrew Mahone --- fs/btrfs/ioctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e92e576300..9ec23b93e01 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -832,7 +832,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) } static int should_defrag_range(struct inode *inode, u64 start, int thresh, - u64 *last_len, u64 *skip, u64 *defrag_end) + u64 *last_len, u64 *skip, u64 *defrag_end, + int compress) { struct extent_map *em; int ret = 1; @@ -863,7 +864,7 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh, * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it */ - if ((*last_len == 0 || *last_len >= thresh) && + if (!compress && (*last_len == 0 || *last_len >= thresh) && (em->len >= thresh || !next_mergeable)) ret = 0; out: @@ -1145,7 +1146,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, extent_thresh, &last_len, &skip, - &defrag_end)) { + &defrag_end, range->flags & + BTRFS_DEFRAG_RANGE_COMPRESS)) { unsigned long next; /* * the should_defrag function tells us how much to skip -- cgit v1.2.3 From b27f7c0c150f74564b5d4c6c24a03c5226bf6327 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 22 Jun 2012 06:30:39 -0600 Subject: btrfs: join DEV_STATS ioctls to one Commit c11d2c236cc260b36 (Btrfs: add ioctl to get and reset the device stats) introduced two ioctls doing almost the same thing distinguished by just the ioctl number which encodes "do reset after read". I have suggested http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg16604.html to implement it via the ioctl args. This hasn't happen, and I think we should use a more clean way to pass flags and should not waste ioctl numbers. CC: Stefan Behrens Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 ++++++++-------- fs/btrfs/ioctl.h | 6 ++++-- fs/btrfs/volumes.c | 5 ++--- fs/btrfs/volumes.h | 3 +-- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9ec23b93e01..3a3f916d7c0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,19 +3065,21 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, } static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, - void __user *arg, int reset_after_read) + void __user *arg) { struct btrfs_ioctl_get_dev_stats *sa; int ret; - if (reset_after_read && !capable(CAP_SYS_ADMIN)) - return -EPERM; - sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); - ret = btrfs_get_dev_stats(root, sa, reset_after_read); + if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { + kfree(sa); + return -EPERM; + } + + ret = btrfs_get_dev_stats(root, sa); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; @@ -3475,9 +3477,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_GET_DEV_STATS: - return btrfs_ioctl_get_dev_stats(root, argp, 0); - case BTRFS_IOC_GET_AND_RESET_DEV_STATS: - return btrfs_ioctl_get_dev_stats(root, argp, 1); + return btrfs_ioctl_get_dev_stats(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e440aa653c3..021c55ed8ae 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -285,9 +285,13 @@ enum btrfs_dev_stat_values { BTRFS_DEV_STAT_VALUES_MAX }; +/* Reset statistics after reading; needs SYS_ADMIN capability */ +#define BTRFS_DEV_STATS_RESET (1ULL << 0) + struct btrfs_ioctl_get_dev_stats { __u64 devid; /* in */ __u64 nr_items; /* in/out */ + __u64 flags; /* in/out */ /* out values: */ __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; @@ -361,7 +365,5 @@ struct btrfs_ioctl_get_dev_stats { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) -#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ - struct btrfs_ioctl_get_dev_stats) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ecaad40e7ef..957bf393ab4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4890,8 +4890,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) } int btrfs_get_dev_stats(struct btrfs_root *root, - struct btrfs_ioctl_get_dev_stats *stats, - int reset_after_read) + struct btrfs_ioctl_get_dev_stats *stats) { struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; @@ -4909,7 +4908,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, printk(KERN_WARNING "btrfs: get dev_stats failed, not yet valid\n"); return -ENODEV; - } else if (reset_after_read) { + } else if (stats->flags & BTRFS_DEV_STATS_RESET) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (stats->nr_items > i) stats->values[i] = diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 95f6637614d..e404414a95a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -293,8 +293,7 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, void btrfs_dev_stat_print_on_error(struct btrfs_device *device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, - struct btrfs_ioctl_get_dev_stats *stats, - int reset_after_read); + struct btrfs_ioctl_get_dev_stats *stats); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -- cgit v1.2.3 From 96c3f4331a8c1cd0a58307e4ac7e73e09d7dab23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Jun 2012 14:05:49 -0400 Subject: Btrfs: flush delayed inodes if we're short on space Those crazy gentoo guys have been complaining about ENOSPC errors on their portage volumes. This is because doing things like untar tends to create lots of new files which will soak up all the reservation space in the delayed inodes. Usually this gets papered over by the fact that we will try and commit the transaction, however if this happens in the wrong spot or we choose not to commit the transaction you will be screwed. So add the ability to expclitly flush delayed inodes to free up space. Please test this out guys to make sure it works since as usual I cannot reproduce. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/delayed-inode.c | 22 +++++++++-- fs/btrfs/delayed-inode.h | 2 + fs/btrfs/extent-tree.c | 97 +++++++++++++++++++++++++++++++----------------- 3 files changed, 83 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2399f408691..21d91a8073e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1113,8 +1113,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * Returns < 0 on error and returns with an aborted transaction with any * outstanding delayed items cleaned up. */ -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) { struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; @@ -1122,6 +1122,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; + bool count = (nr > 0); if (trans->aborted) return -EIO; @@ -1137,7 +1138,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node) { + while (curr_node && (!count || (count && nr--))) { curr_root = curr_node->root; ret = btrfs_insert_delayed_items(trans, path, curr_root, curr_node); @@ -1149,6 +1150,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path, curr_node); if (ret) { btrfs_release_delayed_node(curr_node); + curr_node = NULL; btrfs_abort_transaction(trans, root, ret); break; } @@ -1158,12 +1160,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, btrfs_release_delayed_node(prev_node); } + if (curr_node) + btrfs_release_delayed_node(curr_node); btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; } +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_run_delayed_items(trans, root, -1); +} + +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + return __btrfs_run_delayed_items(trans, root, nr); +} + static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f5aa4023d3e..4f808e1baee 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode); int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr); void btrfs_balance_delayed_items(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1d36702ff..3cde907a25a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3728,6 +3728,60 @@ commit: return btrfs_commit_transaction(trans, root); } +enum flush_state { + FLUSH_DELALLOC = 1, + FLUSH_DELALLOC_WAIT = 2, + FLUSH_DELAYED_ITEMS_NR = 3, + FLUSH_DELAYED_ITEMS = 4, + COMMIT_TRANS = 5, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret; + + switch (state) { + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + ret = shrink_delalloc(root, num_bytes, + state == FLUSH_DELALLOC_WAIT); + if (ret > 0) + ret = 0; + break; + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) { + u64 bytes = btrfs_calc_trans_metadata_size(root, 1); + + nr = (int)div64_u64(num_bytes, bytes); + if (!nr) + nr = 1; + nr *= 2; + } else { + nr = -1; + } + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; + } + + return ret; +} /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3749,11 +3803,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int retries = 0; + int flush_state = FLUSH_DELALLOC; int ret = 0; - bool committed = false; bool flushing = false; - bool wait_ordered = false; + bool committed = false; again: ret = 0; @@ -3812,9 +3865,8 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - wait_ordered = true; num_bytes = used - space_info->total_bytes + - (orig_bytes * (retries + 1)); + (orig_bytes * 2); } if (ret) { @@ -3867,8 +3919,6 @@ again: trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else { - wait_ordered = true; } } @@ -3887,36 +3937,13 @@ again: if (!ret || !flush) goto out; - /* - * We do synchronous shrinking since we don't actually unreserve - * metadata until after the IO is completed. - */ - ret = shrink_delalloc(root, num_bytes, wait_ordered); - if (ret < 0) - goto out; - - ret = 0; - - /* - * So if we were overcommitted it's possible that somebody else flushed - * out enough space and we simply didn't have enough space to reclaim, - * so go back around and try again. - */ - if (retries < 2) { - wait_ordered = true; - retries++; + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; + if (!ret) goto again; - } - - ret = -ENOSPC; - if (committed) - goto out; - - ret = may_commit_transaction(root, space_info, orig_bytes, 0); - if (!ret) { - committed = true; + else if (flush_state <= COMMIT_TRANS) goto again; - } out: if (flushing) { -- cgit v1.2.3 From 02db0844beffc1c4e99d750be58ffb3ed95d6d62 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Jun 2012 16:03:58 -0400 Subject: Btrfs: add DEVICE_READY ioctl This will be used in conjunction with btrfs device ready . This is needed for initrd's to have a nice and lightweight way to tell if all of the devices needed for a file system are in the cache currently. This keeps them from having to do mount+sleep loops waiting for devices to show up. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.h | 3 ++- fs/btrfs/super.c | 7 +++++++ fs/btrfs/volumes.c | 9 ++++++++- fs/btrfs/volumes.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 021c55ed8ae..4e3e5d342a2 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -363,7 +363,8 @@ struct btrfs_ioctl_get_dev_stats { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ + struct btrfs_ioctl_vol_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) - #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 88a2d2bb2d7..26da344231a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1462,6 +1462,13 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = btrfs_scan_one_device(vol->name, FMODE_READ, &btrfs_fs_type, &fs_devices); break; + case BTRFS_IOC_DEVICES_READY: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + if (ret) + break; + ret = !(fs_devices->num_devices == fs_devices->total_devices); + break; } kfree(vol); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 957bf393ab4..39a0d04759f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -429,6 +429,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; + fs_devices->total_devices = orig->total_devices; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); /* We have held the volume lock, it is safe to get the devices. */ @@ -739,6 +740,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, int ret; u64 devid; u64 transid; + u64 total_devices; flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -760,6 +762,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); + total_devices = btrfs_super_num_devices(disk_super); if (disk_super->label[0]) printk(KERN_INFO "device label %s ", disk_super->label); else @@ -767,7 +770,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); - + if (!ret && fs_devices_ret) + (*fs_devices_ret)->total_devices = total_devices; brelse(bh); error_close: mutex_unlock(&uuid_mutex); @@ -1433,6 +1437,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_rcu(&device->dev_list); device->fs_devices->num_devices--; + device->fs_devices->total_devices--; if (device->missing) root->fs_info->fs_devices->missing_devices--; @@ -1550,6 +1555,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; + fs_devices->total_devices = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -1749,6 +1755,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_devices++; if (device->can_discard) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e404414a95a..5479325987b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -126,6 +126,7 @@ struct btrfs_fs_devices { u64 missing_devices; u64 total_rw_bytes; u64 num_can_discard; + u64 total_devices; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex -- cgit v1.2.3 From fed425c742cb1262ce90a41f2d3d211bac099533 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 Jun 2012 12:13:01 -0600 Subject: Btrfs: do not return EINVAL instead of ENOMEM from open_ctree() When bailing from open_ctree() err is returned, not ret. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2936ca49b3b..fd216d9369f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2244,7 +2244,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); if (ret) { - ret = -ENOMEM; + err = -ENOMEM; goto fail_sb_buffer; } -- cgit v1.2.3 From 44c44af2f4a6dc1595f1711cf307bd01062fd129 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 Jun 2012 12:14:13 -0600 Subject: Btrfs: do not ignore errors from btrfs_cleanup_fs_roots() when mounting There used to be a BUG_ON(ret) there before EH patch (79787eaa) went in. Bail out with EINVAL. Cc: David Sterba Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd216d9369f..dd6676b446f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2466,8 +2466,8 @@ retry_root_backup: if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) { - } + if (ret) + goto fail_trans_kthread; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { -- cgit v1.2.3 From 23291a044c31f9dfdeaf633b631059fb75e5c2c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Jun 2012 05:15:23 -0600 Subject: Btrfs: fix error handling in __add_reloc_root() We dereferenced "node" in the error message after freeing it. Also btrfs_panic() can return so we should return an error code instead of continuing. Signed-off-by: Dan Carpenter --- fs/btrfs/relocation.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 646ee21bb03..c5dbd914967 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1239,10 +1239,11 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - kfree(node); btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " "tree\n"); + kfree(node); + return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); -- cgit v1.2.3 From b9959295151625c17723103afd79077e80b24ddd Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 25 Jun 2012 21:25:22 -0600 Subject: Btrfs: return error of btrfs_update_inode() to caller We didn't check error of btrfs_update_inode(), but that error looks easy to bubble back up. Reviewed-by: David Sterba Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 +- fs/btrfs/tree-log.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fcc65802f36..f93a98e65d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2833,7 +2833,7 @@ err: inode_inc_iversion(inode); inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; - btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, dir); out: return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8abeae4224f..c86670f4f28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -637,7 +637,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } inode_set_bytes(inode, saved_nbytes); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); out: if (inode) iput(inode); @@ -1133,7 +1133,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { btrfs_inc_nlink(inode); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; } else { -- cgit v1.2.3 From 0e721106923be82f651dd0ee504742a8a3eb089f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jun 2012 16:13:18 -0400 Subject: Btrfs: change how we indicate we're adding csums There is weird logic I had to put in place to make sure that when we were adding csums that we'd used the delalloc block rsv instead of the global block rsv. Part of this meant that we had to free up our transaction reservation before we ran the delayed refs since csum deletion happens during the delayed ref work. The problem with this is that when we release a reservation we will add it to the global reserve if it is not full in order to keep us going along longer before we have to force a transaction commit. By releasing our reservation before we run delayed refs we don't get the opportunity to drain down the global reserve for the work we did, so we won't refill it as often. This isn't a problem per-se, it just results in us possibly committing transactions more and more often, and in rare cases could cause those WARN_ON()'s to pop in use_block_rsv because we ran out of space in our block rsv. This also helps us by holding onto space while the delayed refs run so we don't end up with as many people trying to do things at the same time, which again will help us not force commits or hit the use_block_rsv warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 8 +++++++- fs/btrfs/file-item.c | 2 ++ fs/btrfs/transaction.c | 22 ++++++++-------------- fs/btrfs/transaction.h | 1 + 4 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3cde907a25a..ec0328bb86d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3961,7 +3961,10 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows || root == root->fs_info->csum_root) + if (root->ref_cows) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->csum_root && trans->adding_csums) block_rsv = trans->block_rsv; if (!block_rsv) @@ -4313,6 +4316,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + if (!trans->block_rsv) + return; + if (!trans->bytes_reserved) return; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5d158d32023..863c34d111b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -690,6 +690,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; + trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -853,6 +854,7 @@ next_sector: goto again; } out: + trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cb2dfe29394..328b95f6766 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -351,6 +351,7 @@ again: h->bytes_reserved = 0; h->delayed_ref_updates = 0; h->use_count = 1; + h->adding_csums = 0; h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; @@ -473,7 +474,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; int err; @@ -481,12 +481,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; - /* - * We need to do this in case we're deleting csums so the global block - * rsv get's used instead of the csum block rsv. - */ - trans->block_rsv = NULL; - updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { @@ -495,8 +489,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, return err; } - trans->block_rsv = rsv; - return should_end_transaction(trans, root); } @@ -513,8 +505,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - btrfs_trans_release_metadata(trans, root); - trans->block_rsv = NULL; while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -527,6 +517,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { @@ -1269,9 +1261,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); - btrfs_trans_release_metadata(trans, root); - trans->block_rsv = NULL; - if (cur_trans->aborted) goto cleanup_transaction; @@ -1282,6 +1271,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + cur_trans = trans->transaction; /* @@ -1533,6 +1525,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; cleanup_transaction: + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fe27379e368..d314a74b496 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,7 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; int aborted; + int adding_csums; }; struct btrfs_pending_snapshot { -- cgit v1.2.3 From 287082b0bd10060e9c6b32ed9605174ddf2f672a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 28 Jun 2012 04:02:24 -0600 Subject: Btrfs: fix typo in cow_file_range_async and async_cow_submit It should be 10 * 1024 * 1024. Signed-off-by: Liu Bo Signed-off-by: Jiri Kosina --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f93a98e65d6..18f1b44d161 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1010,7 +1010,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); if (atomic_read(&root->fs_info->async_delalloc_pages) < - 5 * 1042 * 1024 && + 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1035,7 +1035,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1042; + int limit = 10 * 1024 * 1024; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); -- cgit v1.2.3 From a874a63e1389c1daabd5abe4e4faaf9d63daf474 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2012 03:58:46 -0600 Subject: Btrfs: check write access to mount earlier while creating snapshots Move check of write access to mount into upper functions so that we can use mnt_want_write_file instead, which is faster than mnt_want_write. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3a3f916d7c0..c1f4975648e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -652,13 +652,9 @@ static noinline int btrfs_mksubvol(struct path *parent, if (dentry->d_inode) goto out_dput; - error = mnt_want_write(parent->mnt); - if (error) - goto out_dput; - error = btrfs_may_create(dir, dentry); if (error) - goto out_drop_write; + goto out_dput; down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); @@ -676,8 +672,6 @@ static noinline int btrfs_mksubvol(struct path *parent, fsnotify_mkdir(dir, dentry); out_up_read: up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); -out_drop_write: - mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: @@ -1395,16 +1389,20 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + goto out; + namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; - goto out; + goto out_drop_write; } if (name[0] == '.' && (namelen == 1 || (name[1] == '.' && namelen == 2))) { ret = -EEXIST; - goto out; + goto out_drop_write; } if (subvol) { @@ -1415,7 +1413,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_file = fget(fd); if (!src_file) { ret = -EINVAL; - goto out; + goto out_drop_write; } src_inode = src_file->f_path.dentry->d_inode; @@ -1424,13 +1422,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, "another FS\n"); ret = -EINVAL; fput(src_file); - goto out; + goto out_drop_write; } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, transid, readonly); fput(src_file); } +out_drop_write: + mnt_drop_write_file(file); out: return ret; } -- cgit v1.2.3 From 768e9dfe820abdcfb6683e05c60b8634f1a4ffce Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2012 03:58:47 -0600 Subject: Btrfs: remove redundant r/o check for superblock mnt_want_write() and mnt_want_write_file() will check sb->s_flags with MS_RDONLY, and we don't need to do it ourselves. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c1f4975648e..b8034dc62e3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1381,14 +1381,10 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, u64 *transid, bool readonly) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; int namelen; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - ret = mnt_want_write_file(file); if (ret) goto out; @@ -3269,9 +3265,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; -- cgit v1.2.3 From e54bfa31044d602a57d4e190f6d1c3763ea76bfe Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2012 03:58:48 -0600 Subject: Btrfs: use mnt_want_write_file instead of mnt_want_write mnt_want_write_file is faster when file has been opened for write. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b8034dc62e3..13ed1c9534c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3265,7 +3265,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3335,7 +3335,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From b9ca0664dc806ba70587f6f3202b60dc736cd6e5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2012 03:58:49 -0600 Subject: Btrfs: do not set subvolume flags in readonly mode $ mkfs.btrfs /dev/sdb7 $ btrfstune -S1 /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs mount: block device /dev/sdb7 is write-protected, mounting read-only $ btrfs dev add /dev/sdb8 /mnt/btrfs/ Now we get a btrfs in which mnt flags has readonly but sb flags does not. So for those ioctls that only check sb flags with MS_RDONLY, it is going to be a problem. Setting subvolume flags is such an ioctl, we should use mnt_want_write_file() to check RO flags. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 13ed1c9534c..17facea6a51 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1521,29 +1521,40 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + goto out; - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) - return -EINVAL; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out_drop_write; + } - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; + if (copy_from_user(&flags, arg, sizeof(flags))) { + ret = -EFAULT; + goto out_drop_write; + } - if (flags & BTRFS_SUBVOL_CREATE_ASYNC) - return -EINVAL; + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out_drop_write; + } - if (flags & ~BTRFS_SUBVOL_RDONLY) - return -EOPNOTSUPP; + if (flags & ~BTRFS_SUBVOL_RDONLY) { + ret = -EOPNOTSUPP; + goto out_drop_write; + } - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out_drop_write; + } down_write(&root->fs_info->subvol_sem); /* nothing to do */ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) - goto out; + goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); if (flags & BTRFS_SUBVOL_RDONLY) @@ -1566,8 +1577,11 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); -out: +out_drop_sem: up_write(&root->fs_info->subvol_sem); +out_drop_write: + mnt_drop_write_file(file); +out: return ret; } -- cgit v1.2.3 From f4c738c2e7bc6d696b0d60155df7ea01684962b6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 2 Jul 2012 17:10:51 -0400 Subject: Btrfs: rework shrink_delalloc So shrink_delalloc has grown all sorts of cruft over the years thanks to many reworkings of how we track enospc. What happens now as we fill up the disk is we will loop for freaking ever hoping to reclaim a arbitrary amount of space of metadata, this was from when everybody flushed at the same time. Now we only have people flushing one at a time. So instead of trying to reclaim a huge amount of space, just try to flush a decent chunk of space, and stop looping as soon as we have enough free space to satisfy our reservation. This makes xfstests 224 go much faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 81 +++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 57 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ec0328bb86d..5e552f9cc5b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3586,89 +3586,58 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; - u64 reserved; + u64 delalloc_bytes; u64 max_reclaim; - u64 reclaimed = 0; long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; - unsigned long progress; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_may_use; - progress = space_info->reservation_progress; - - if (reserved == 0) - return 0; - - smp_mb(); - if (root->fs_info->delalloc_bytes == 0) { + delalloc_bytes = root->fs_info->delalloc_bytes; + if (delalloc_bytes == 0) { if (trans) - return 0; + return; btrfs_wait_ordered_extents(root, 0, 0); - return 0; + return; } - max_reclaim = min(reserved, to_reclaim); - nr_pages = max_t(unsigned long, nr_pages, - max_reclaim >> PAGE_CACHE_SHIFT); - while (loops < 1024) { - /* have the flusher threads jump in and do some IO */ - smp_mb(); - nr_pages = min_t(unsigned long, nr_pages, - root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_may_use) - reclaimed += reserved - space_info->bytes_may_use; - reserved = space_info->bytes_may_use; + if (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use + orig <= + space_info->total_bytes) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; - - if (reserved == 0 || reclaimed >= max_reclaim) - break; - - if (trans && trans->transaction->blocked) - return -EAGAIN; - if (wait_ordered && !trans) { btrfs_wait_ordered_extents(root, 0, 0); } else { - time_left = schedule_timeout_interruptible(1); - - /* We were interrupted, exit */ + time_left = schedule_timeout_killable(1); if (time_left) break; } - - /* we've kicked the IO a few times, if anything has been freed, - * exit. There is no sense in looping here for a long time - * when we really need to commit the transaction, or there are - * just too many writers without enough free space - */ - - if (loops > 3) { - smp_mb(); - if (progress != space_info->reservation_progress) - break; - } - + smp_mb(); + delalloc_bytes = root->fs_info->delalloc_bytes; } - - return reclaimed >= to_reclaim; } /** @@ -3742,15 +3711,13 @@ static int flush_space(struct btrfs_root *root, { struct btrfs_trans_handle *trans; int nr; - int ret; + int ret = 0; switch (state) { case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - ret = shrink_delalloc(root, num_bytes, - state == FLUSH_DELALLOC_WAIT); - if (ret > 0) - ret = 0; + shrink_delalloc(root, num_bytes, orig_bytes, + state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: -- cgit v1.2.3 From c0901581ad077004145c9ee80e843fba71c100b8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 10 Jul 2012 07:30:17 -0600 Subject: Btrfs: avoid I/O repair BUG() from btree_read_extent_buffer_pages() From btree_read_extent_buffer_pages(), currently repair_io_failure() can be called with mirror_num being zero when submit_one_bio() returned an error before. This used to cause a BUG_ON(!mirror_num) in repair_io_failure() and indeed this is not a case that needs the I/O repair code to rewrite disk blocks. This commit prevents calling repair_io_failure() in this case and thus avoids the BUG_ON() and malfunction. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd6676b446f..1a4a2a97592 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -407,7 +407,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, break; } - if (failed && !ret) + if (failed && !ret && failed_mirror) repair_eb_io_failure(root, eb, failed_mirror); return ret; -- cgit v1.2.3 From 51a8cf9d2d97017d334f33f1b39067bd2f03bc49 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 10 Jul 2012 05:28:38 -0600 Subject: Btrfs: fix btrfs_is_free_space_inode to recognize btree inode For btree inode, its root is also 'tree root', so btree inode can be misunderstood as a free space inode. We should add one more check for btree inode. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 12394a90d60..b168238bcb1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -194,8 +194,10 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, struct inode *inode) { - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + if (root == root->fs_info->tree_root && + btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) + return true; + if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) return true; return false; } -- cgit v1.2.3 From 83eea1f1bacd5dc7b44dcf84f5fdca54fdea5453 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 10 Jul 2012 05:28:39 -0600 Subject: Btrfs: kill root from btrfs_is_free_space_inode Since root can be fetched via BTRFS_I macro directly, we can save an args for btrfs_is_free_space_inode(). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 5 +++-- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 22 +++++++++++----------- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b168238bcb1..21b8cfe08e9 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -191,9 +191,10 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, - struct inode *inode) +static inline bool btrfs_is_free_space_inode(struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == root->fs_info->tree_root && btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e552f9cc5b..d1ebd2a0611 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4444,7 +4444,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) int ret; /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(inode)) flush = 0; if (flush && btrfs_transaction_in_commit(root->fs_info)) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 863c34d111b..b45b9de0c21 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -183,7 +183,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. */ - if (btrfs_is_free_space_inode(root, inode)) { + if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18f1b44d161..321c415dea7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -825,7 +825,7 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(btrfs_is_free_space_inode(root, inode)); + BUG_ON(btrfs_is_free_space_inode(inode)); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { extent_clear_unlock_delalloc(inode, @@ -1153,7 +1153,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(inode); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -1466,7 +1466,7 @@ static void btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1501,7 +1501,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1612,7 +1612,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (btrfs_is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(inode)) metadata = 2; if (!(rw & REQ_WRITE)) { @@ -1869,7 +1869,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) int ret; bool nolock; - nolock = btrfs_is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(inode); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -2007,7 +2007,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, ordered_extent->work.func = finish_ordered_fn; ordered_extent->work.flags = 0; - if (btrfs_is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(inode)) workers = &root->fs_info->endio_freespace_worker; else workers = &root->fs_info->endio_write_workers; @@ -2732,7 +2732,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(root, inode) + if (!btrfs_is_free_space_inode(inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) @@ -3743,7 +3743,7 @@ void btrfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - btrfs_is_free_space_inode(root, inode))) + btrfs_is_free_space_inode(inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -4457,7 +4457,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; - if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -7051,7 +7051,7 @@ int btrfs_drop_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(root, inode)) + !btrfs_is_free_space_inode(inode)) return 1; else return generic_drop_inode(inode); -- cgit v1.2.3 From 067893842341e7b7487062367ecfaa46c97505e0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:33 -0600 Subject: Btrfs: do not abort transaction in prealloc case During disk balance, we prealloc new file extent for file data relocation, but we may fail in 'no available space' case, and it leads to flipping btrfs into readonly. It is not necessary to bail out and abort transaction since we do have several ways to rescue ourselves from ENOSPC case. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d1ebd2a0611..67bd12a5236 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5748,7 +5748,11 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_LIMITED); - if (ret < 0) { + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); goto out; -- cgit v1.2.3 From cf7c1ef6e1fe05864369f59dd516e816b11de7d0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:34 -0600 Subject: Btrfs: fix a bug of writting free space cache during balance Here is the whole story: 1) A free space cache consists of two parts: o free space cache inode, which is special becase it's stored in root tree. o free space info, which is stored as the above inode's file data. But we only build up another new inode and does not flush its free space info onto disk when we _clear and setup_ free space cache, and this ends up with that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN. And holding DC_SETUP means that we will not truncate this free space cache inode, which means the disk offset of its file extent will remain _unchanged_ at least until next transaction finishes committing itself. 2) We can set a block group readonly when we relocate the block group. However, if the readonly block group covers the disk offset where our free space cache inode is going to write, it will force the free space cache inode into cow_file_range() and it'll end up hitting a BUG_ON. 3) Due to the above analysis, we fix this bug by adding the missing dirty flag. 4) However, it's not over, there is still another case, nospace_cache. With nospace_cache, we do not want to set dirty flag, instead we just truncate free space cache inode and bail out with setting cache state DC_WRITTEN. We can benifit from it since it saves us another 'pre-allocation' part which usually costs a lot. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 67bd12a5236..3ca26d84cce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2903,8 +2903,13 @@ again: } spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED) { - /* We're not cached, don't bother trying to write stuff out */ + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; @@ -7614,8 +7619,21 @@ int btrfs_read_block_groups(struct btrfs_root *root) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - if (need_clear) + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ cache->disk_cache_state = BTRFS_DC_CLEAR; + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->dirty = 1; + } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), -- cgit v1.2.3 From 799ffc3c31de57d10a4b9abcfbfeea8771acc976 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:35 -0600 Subject: Btrfs: add ro notification to dump_space_info Block group has ro attributes, make dump_space_info show it. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ca26d84cce..7843542484c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5825,13 +5825,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", + printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", (unsigned long long)cache->key.objectid, (unsigned long long)cache->key.offset, (unsigned long long)btrfs_block_group_used(&cache->item), (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); + (unsigned long long)cache->reserved, + cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } -- cgit v1.2.3 From f6175efab1e024554a104cca1f86134ef7ce06bc Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2012 03:31:36 -0600 Subject: Btrfs: do not count in readonly bytes If a block group is ro, do not count its entries in when we dump space info. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6c4e2baa929..6b10acfc2f5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1968,7 +1968,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); - if (info->bytes >= bytes) + if (info->bytes >= bytes && !block_group->ro) count++; printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", (unsigned long long)info->offset, -- cgit v1.2.3 From e6466e354a5c23717325adecf387f93be4b9c830 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 4 Jul 2012 08:15:02 -0600 Subject: Btrfs: fix buffer leak in btrfs_next_old_leaf When calling btrfs_next_old_leaf, we were leaking an extent buffer in the rare case of using the deadlock avoidance code needed for the tree mod log. Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8206b390058..67fe46fdee6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5127,6 +5127,7 @@ again: * locked. To solve this situation, we give up * on our lock and cycle. */ + free_extent_buffer(next); btrfs_release_path(path); cond_resched(); goto again; -- cgit v1.2.3 From d5b025d510664382f9a197f7e7fb9fc60fe209bc Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 2 Jul 2012 22:05:21 -0600 Subject: btrfs read error corrected message floods the console during recovery Changing printk_in_rcu to printk_ratelimited_in_rcu will suffice Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 01c21b6c6d4..f08206fcfb2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1918,7 +1918,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } - printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " + printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " "(dev %s sector %llu)\n", page->mapping->host->i_ino, start, rcu_str_deref(dev->name), sector); -- cgit v1.2.3 From b4d7c3c9456a311a45bc1ef8944b5ba5b176244f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jul 2012 20:21:07 -0600 Subject: Btrfs: kill free_space pointer from inode structure Inodes always allocate free space with BTRFS_BLOCK_GROUP_DATA type, which means every inode has the same BTRFS_I(inode)->free_space pointer. This shrinks struct btrfs_inode by 4 bytes (or 8 bytes on 64 bits). Signed-off-by: Li Zefan --- fs/btrfs/btrfs_inode.h | 3 --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 20 ++++++++------------ fs/btrfs/inode.c | 3 --- 4 files changed, 10 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 21b8cfe08e9..5b2ad6bc4fe 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -87,9 +87,6 @@ struct btrfs_inode { /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; - /* the space_info for where this inode's data allocations are done */ - struct btrfs_space_info *space_info; - unsigned long runtime_flags; /* full 64 bit generation number, struct vfs_inode doesn't have a big diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fa5c45b3907..6761490b91c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1240,6 +1240,8 @@ struct btrfs_fs_info { */ struct list_head space_info; + struct btrfs_space_info *data_sinfo; + struct reloc_control *reloc_ctl; spinlock_t delalloc_lock; @@ -2607,7 +2609,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7843542484c..6621ed72f3c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3139,6 +3139,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; return 0; } @@ -3268,12 +3270,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return get_alloc_profile(root, flags); } -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - BTRFS_BLOCK_GROUP_DATA); -} - /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. @@ -3282,6 +3278,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0, committed = 0, alloc_chunk = 1; @@ -3294,7 +3291,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) committed = 1; } - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = fs_info->data_sinfo; if (!data_sinfo) goto alloc; @@ -3335,10 +3332,9 @@ alloc: goto commit_trans; } - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + goto again; } @@ -3385,7 +3381,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 321c415dea7..ee45ebf4219 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4082,7 +4082,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) struct btrfs_iget_args *args = p; inode->i_ino = args->ino; BTRFS_I(inode)->root = args->root; - btrfs_set_inode_space_info(args->root, inode); return 0; } @@ -4667,7 +4666,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; - btrfs_set_inode_space_info(root, inode); if (S_ISDIR(mode)) owner = 0; @@ -6944,7 +6942,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) return NULL; ei->root = NULL; - ei->space_info = NULL; ei->generation = 0; ei->last_trans = 0; ei->last_sub_trans = 0; -- cgit v1.2.3 From 293f7e07405a63975cee4e95a2cfa0c17b34b3aa Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 10 Jul 2012 00:58:58 -0600 Subject: Btrfs: zero unused bytes in inode item The otime field is not zeroed, so users will see random otime in an old filesystem with a new kernel which has otime support in the future. The reserved bytes are also not zeroed, and we'll have compatibility issue if we make use of those bytes. Signed-off-by: Li Zefan --- fs/btrfs/delayed-inode.c | 1 + fs/btrfs/inode.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 21d91a8073e..335605c8cea 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); delayed_node->bytes_reserved = 0; + memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee45ebf4219..144f4642b2a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4693,6 +4693,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, + sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, -- cgit v1.2.3 From 18077bb413687f96bd168efcfb2b8778529e3b74 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jul 2012 20:22:35 -0600 Subject: Btrfs: rewrite BTRFS_SETGET_FUNCS BTRFS_SETGET_FUNCS macro is used to generate btrfs_set_foo() and btrfs_foo() functions, which read and write specific fields in the extent buffer. The total number of set/get functions is ~200, but in fact we only need 8 functions: 2 for u8 field, 2 for u16, 2 for u32 and 2 for u64. It results in redunction of ~37K bytes. text data bss dec hex filename 629661 12489 216 642366 9cd3e fs/btrfs/btrfs.o.orig 592637 12489 216 605342 93c9e fs/btrfs/btrfs.o Signed-off-by: Li Zefan --- fs/btrfs/ctree.h | 53 +++++++++++-- fs/btrfs/struct-funcs.c | 196 ++++++++++++++++++++++++------------------------ 2 files changed, 146 insertions(+), 103 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6761490b91c..a0ee2f8e056 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1623,13 +1623,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) offsetof(type, member), \ sizeof(((type *)0)->member))) -#ifndef BTRFS_SETGET_FUNCS +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val, \ + struct btrfs_map_token *token); \ +static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off) \ +{ \ + return btrfs_get_token_##bits(eb, ptr, off, NULL); \ +} \ +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ +} + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); -#endif +static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ +} \ +static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ +} #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c6ffa581241..b976597b072 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -17,15 +17,27 @@ */ #include +#include -/* this is some deeply nasty code. ctree.h has a different - * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef +#include "ctree.h" + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +/* + * this is some deeply nasty code. * * The end result is that anyone who #includes ctree.h gets a - * declaration for the btrfs_set_foo functions and btrfs_foo functions - * - * This file declares the macros and then #includes ctree.h, which results - * in cpp creating the function here based on the template below. + * declaration for the btrfs_set_foo functions and btrfs_foo functions, + * which are wappers of btrfs_set_token_#bits functions and + * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping * required to efficiently read and write specific fields in the extent @@ -33,103 +45,93 @@ * an unsigned long offset into the extent buffer which has been * cast to a specific type. This gives us all the gcc type checking. * - * The extent buffer api is used to do all the kmapping and page - * spanning work required to get extent buffers in highmem and have - * a metadata blocksize different from the page size. - * - * The macro starts with a simple function prototype declaration so that - * sparse won't complain about it being static. + * The extent buffer api is used to do the page spanning work required to + * have a metadata blocksize different from the page size. */ -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, \ - type *s, struct btrfs_map_token *token) \ +#define DEFINE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token) \ { \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ - u##bits res; \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - res = le##bits##_to_cpu(p->member); \ - return res; \ - } \ - err = map_private_extent_buffer(eb, offset, \ - mem_len, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ - return res; \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + res = get_unaligned_le##bits(p + off); \ + return res; \ + } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ + } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ + return res; \ } \ -void btrfs_set_token_##name(struct extent_buffer *eb, \ - type *s, u##bits val, struct btrfs_map_token *token) \ +void btrfs_set_token_##bits(struct extent_buffer *eb, \ + void *ptr, unsigned long off, u##bits val, \ + struct btrfs_map_token *token) \ { \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ - err = map_private_extent_buffer(eb, offset, \ - mem_len, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ -} \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ -{ \ - btrfs_set_token_##name(eb, s, val, NULL); \ -} \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ -{ \ - return btrfs_token_##name(eb, s, NULL); \ -} \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + kaddr = token->kaddr; \ + p = kaddr + part_offset - token->offset; \ + put_unaligned_le##bits(val, p + off); \ + return; \ + } \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ + return; \ + } \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ +} -#include "ctree.h" +DEFINE_BTRFS_SETGET_BITS(8) +DEFINE_BTRFS_SETGET_BITS(16) +DEFINE_BTRFS_SETGET_BITS(32) +DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) -- cgit v1.2.3 From 5021976d8dd6d94248026631bfa4578aacd7b563 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 17 Jul 2012 09:02:10 -0600 Subject: Btrfs: remove unwanted printk() for btrfs device I/O stats People complained about the annoying kernel log message "btrfs: no dev_stats entry found ... (OK on first mount after mkfs)" everytime a filesystem is mounted for the first time after running mkfs. Since the distribution of the btrfs-progs is not synchronized to the kernel version, mkfs like it is now will be used also in the future. Then this message is not useful to find errors, it is just annoying. This commit removes the printk(). Signed-off-by: Stefan Behrens --- fs/btrfs/volumes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 39a0d04759f..14436074350 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4743,9 +4743,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", - rcu_str_deref(device->name), - (unsigned long long)device->devid); __btrfs_reset_dev_stats(device); device->dev_stats_valid = 1; btrfs_release_path(path); -- cgit v1.2.3 From a98cdb85b990765dbe80a215367ae007320bfeea Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 17 Jul 2012 09:02:11 -0600 Subject: Btrfs: suppress printk() if all device I/O stats are zero Code is added to suppress the I/O stats printing at mount time if all statistic values are zero. Signed-off-by: Stefan Behrens --- fs/btrfs/volumes.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 14436074350..b8708f994e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4884,6 +4884,14 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (btrfs_dev_stat_read(dev, i) != 0) + break; + if (i == BTRFS_DEV_STAT_VALUES_MAX) + return; /* all values == 0, suppress message */ + printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), -- cgit v1.2.3 From e64860aa05048fa7a8483ca698b17c2caf5625cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Jul 2012 16:05:36 -0400 Subject: Btrfs: don't return true in releasepage unless we actually freed the eb I noticed while looking at an extent_buffer race that we will unconditionally return 1 if we get down to release_extent_buffer after clearing the tree ref. However we can easily race in here and get a ref on the eb and not actually free the eb. So make release_extent_buffer return 1 if it free'd the eb and 0 if not so we can be a little kinder to the vm. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f08206fcfb2..e6243f78743 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4300,7 +4300,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } /* Expects to have eb->eb_lock already held */ -static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { @@ -4321,9 +4321,11 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) btrfs_release_extent_buffer_page(eb, 0); call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return; + return 1; } spin_unlock(&eb->refs_lock); + + return 0; } void free_extent_buffer(struct extent_buffer *eb) @@ -4962,7 +4964,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) spin_unlock(&eb->refs_lock); return 0; } - release_extent_buffer(eb, mask); - return 1; + return release_extent_buffer(eb, mask); } -- cgit v1.2.3 From 594831c4b232b094d645503ecedec2e35dcebdf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Jul 2012 16:11:08 -0400 Subject: Btrfs: fix potential race in extent buffer freeing This sounds sort of impossible but it is the only thing I can think of and at the very least it is theoretically possible so here it goes. If we are in try_release_extent_buffer we will check that the ref count on the extent buffer is 1 and not under IO, and then go down and clear the tree ref. If between this check and clearing the tree ref somebody else comes in and grabs a ref on the eb and the marks it dirty before try_release_extent_buffer() does it's tree ref clear we can end up with a dirty eb that will be freed while it is still dirty which will result in a panic. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e6243f78743..e1939a6c747 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4123,11 +4123,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * So bump the ref count first, then set the bit. If someone * beat us to it, drop the ref we added. */ - if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_lock(&eb->refs_lock); + if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_inc(&eb->refs); - if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - } + spin_unlock(&eb->refs_lock); } static void mark_extent_buffer_accessed(struct extent_buffer *eb) @@ -4239,9 +4238,7 @@ again: goto free_eb; } /* add one reference for the tree */ - spin_lock(&eb->refs_lock); check_buffer_tree_ref(eb); - spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); -- cgit v1.2.3 From 51561ffec9614618f3da362f9d1b03a95b717484 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Jul 2012 16:25:24 -0400 Subject: Btrfs: lock the transition from dirty to writeback for an eb There is a small window where an eb can have no IO bits set on it, which could potentially result in extent_buffer_under_io() returning false when we want it to return true, which could result in not fun things happening. So in order to protect this case we need to hold the refs_lock when we make this transition to make sure we get reliable results out of extent_buffer_udner_io(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e1939a6c747..97efc2f2259 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3077,8 +3077,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, } } + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. + */ + spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); spin_lock(&fs_info->delalloc_lock); if (fs_info->dirty_metadata_bytes >= eb->len) @@ -3087,6 +3094,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, WARN_ON(1); spin_unlock(&fs_info->delalloc_lock); ret = 1; + } else { + spin_unlock(&eb->refs_lock); } btrfs_tree_unlock(eb); -- cgit v1.2.3 From df57dbe6bf73cc44305d81c24982a11da49b1f79 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 23 Jul 2012 05:50:03 -0600 Subject: Btrfs: make btrfs's allocation smoothly with preallocation For backref walking, we've introduce delayed ref's sequence. However, it changes our preallocation behavior. The story is that when we preallocate an extent and then mark it written piece by piece, the ideal case should be that we don't need to COW the extent, which is why we use 'preallocate'. But we may not make use of preallocation, since when we check for cross refs on the extent, we may have two ref entries which have the same content except the sequence value, and we recognize them as cross refs and do COW to allocate another extent. So we end up with several pieces of space instead of an whole extent. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6621ed72f3c..71b2d1c7da6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2581,8 +2581,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, node = rb_prev(node); if (node) { + int seq = ref->seq; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) + if (ref->bytenr == bytenr && ref->seq == seq) goto out_unlock; } -- cgit v1.2.3 From 67c9684f48ea9cbc5e9b8a1feb3151800e9dcc22 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 20 Jul 2012 21:43:09 -0600 Subject: Btrfs: improve multi-thread buffer read While testing with my buffer read fio jobs[1], I find that btrfs does not perform well enough. Here is a scenario in fio jobs: We have 4 threads, "t1 t2 t3 t4", starting to buffer read a same file, and all of them will race on add_to_page_cache_lru(), and if one thread successfully puts its page into the page cache, it takes the responsibility to read the page's data. And what's more, reading a page needs a period of time to finish, in which other threads can slide in and process rest pages: t1 t2 t3 t4 add Page1 read Page1 add Page2 | read Page2 add Page3 | | read Page3 add Page4 | | | read Page4 -----|------------|-----------|-----------|-------- v v v v bio bio bio bio Now we have four bios, each of which holds only one page since we need to maintain consecutive pages in bio. Thus, we can end up with far more bios than we need. Here we're going to a) delay the real read-page section and b) try to put more pages into page cache. With that said, we can make each bio hold more pages and reduce the number of bios we need. Here is some numbers taken from fio results: w/o patch w patch ------------- -------- --------------- READ: 745MB/s +25% 934MB/s [1]: [global] group_reporting thread numjobs=4 bs=32k rw=read ioengine=sync directory=/mnt/btrfs/ [READ] filename=foobar size=2000M invalidate=1 Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 97efc2f2259..3e7c9ed6505 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3566,19 +3566,38 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; unsigned long bio_flags = 0; + struct page *pagepool[16]; + struct page *page; + int i = 0; + int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { - __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); + page_cache_release(page); + continue; } - page_cache_release(page); + + pagepool[nr++] = page; + if (nr < ARRAY_SIZE(pagepool)) + continue; + for (i = 0; i < nr; i++) { + __extent_read_full_page(tree, pagepool[i], get_extent, + &bio, 0, &bio_flags); + page_cache_release(pagepool[i]); + } + nr = 0; } + for (i = 0; i < nr; i++) { + __extent_read_full_page(tree, pagepool[i], get_extent, + &bio, 0, &bio_flags); + page_cache_release(pagepool[i]); + } + BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(READ, bio, 0, bio_flags); -- cgit v1.2.3 From 362a20c5e27614739c46707d1c5f55c214d164ce Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 1 Aug 2011 18:11:57 +0200 Subject: btrfs: allow cross-subvolume file clone Lift the EXDEV condition and allow different root trees for files being cloned, then pass source inode's root when searching for extents. Cloning is not allowed to cross vfsmounts, ie. when two subvolumes from one filesystem are mounted separately. Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e92e576300..7011871c45b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2340,6 +2340,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_drop_write; } + ret = -EXDEV; + if (src_file->f_path.mnt != file->f_path.mnt) + goto out_fput; + src = src_file->f_dentry->d_inode; ret = -EINVAL; @@ -2360,7 +2364,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_fput; ret = -EXDEV; - if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + if (src->i_sb != inode->i_sb) goto out_fput; ret = -ENOMEM; @@ -2434,13 +2438,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * note the key will change type as we walk through the * tree. */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, + 0, 0); if (ret < 0) goto out; nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) goto out; if (ret > 0) -- cgit v1.2.3 From e679376911d016b670c8cfc1645c178f77e8d1d3 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 11:18:10 +0200 Subject: Btrfs: add helper for tree enumeration Often no exact match is wanted but just the next lower or higher item. There's a lot of duplicated code throughout btrfs to deal with the corner cases. This patch adds a helper function that can facilitate searching. Signed-off-by: Arne Jansen --- fs/btrfs/ctree.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 3 +++ 2 files changed, 77 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8206b390058..c82a9e4a953 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2721,6 +2721,80 @@ done: return ret; } +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret < 0) + return ret; + if (!ret) { + p->slots[0] = btrfs_header_nritems(leaf) - 1; + return 0; + } + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } else { + --p->slots[0]; + } + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fa5c45b3907..8cfde9326dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2711,6 +2711,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, -- cgit v1.2.3 From 2b0ce2c2909368d124a78a88e5c7106fdcba6221 Mon Sep 17 00:00:00 2001 From: Mitch Harder Date: Tue, 24 Jul 2012 11:58:43 -0600 Subject: Btrfs: Check INCOMPAT flags on remount and add helper function In support of the recently added capability to remount with lzo compression, provide a helper function to check the compression INCOMPAT flags when remounting with lzo compression, and set the flags if necessary. Also, implement the new helper function when defragmenting with explicit lzo compression and when setting the default subvolume. Signed-off-by: Mitch Harder Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 17 +++++++++++++++++ fs/btrfs/ioctl.c | 16 ++-------------- fs/btrfs/super.c | 1 + 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 00f9a50f986..0f369da5cd9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3288,6 +3288,23 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + } +} + #define btrfs_abort_transaction(trans, root, errno) \ do { \ __btrfs_abort_transaction(trans, root, __func__, \ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e54b663fd3a..3f3cbe928a1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1053,11 +1053,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; u64 isize = i_size_read(inode); - u64 features; u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; @@ -1244,11 +1242,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = root->fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - btrfs_set_super_incompat_flags(disk_super, features); + btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO); } ret = defrag_count; @@ -2784,8 +2779,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_path *path; struct btrfs_key location; struct btrfs_disk_key disk_key; - struct btrfs_super_block *disk_super; - u64 features; u64 objectid = 0; u64 dir_id; @@ -2836,12 +2829,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = root->fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { - features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; - btrfs_set_super_incompat_flags(disk_super, features); - } + btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans, root); return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 26da344231a..75ee2c7791f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -401,6 +401,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; info->compress_type = BTRFS_COMPRESS_NONE; -- cgit v1.2.3 From e9fbcb42201c862fd6ab45c48ead4f47bb2dea9d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jul 2012 15:57:13 -0400 Subject: Btrfs: call the ordered free operation without any locks held Each ordered operation has a free callback, and this was called with the worker spinlock held. Josef made the free callback also call iput, which we can't do with the spinlock. This drops the spinlock for the free operation and grabs it again before moving through the rest of the list. We'll circle back around to this and find a cleaner way that doesn't bounce the lock around so much. Signed-off-by: Chris Mason cc: stable@kernel.org --- fs/btrfs/async-thread.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 42704149b72..58b7d14b08e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers, work->ordered_func(work); - /* now take the lock again and call the freeing code */ + /* now take the lock again and drop our item from the list */ spin_lock(&workers->order_lock); list_del(&work->order_list); + spin_unlock(&workers->order_lock); + + /* + * we don't want to call the ordered free functions + * with the lock held though + */ work->ordered_free(work); + spin_lock(&workers->order_lock); } spin_unlock(&workers->order_lock); -- cgit v1.2.3 From cd1cfc49153ba2bef247e500d8bd4d135193ece9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jul 2012 16:03:32 -0400 Subject: Btrfs: add a barrier before a waitqueue_active check We were missing wakeups on the delayed ref waitqueue due to races on waitqueue_active. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44f06201f37..4e1b153b7c4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5294,6 +5294,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + smp_mb(); if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) wake_up(&root->fs_info->tree_mod_seq_wait); -- cgit v1.2.3 From 91cb916ca26feb99c78c131a1643af3d10fefd96 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sun, 3 Jun 2012 14:23:23 +0200 Subject: Btrfs: make iref_to_path non static Make iref_to_path non static (needed in send) and rename it to btrfs_iref_to_path Signed-off-by: Alexander Block --- fs/btrfs/backref.c | 10 +++++----- fs/btrfs/backref.h | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a383c18e74e..e99fe0e31da 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1125,10 +1125,10 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, * required for the path to fit into the buffer. in that case, the returned * value will be smaller than dest. callers must check this! */ -static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) +char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) { u32 len; int slot; @@ -1543,7 +1543,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c18d8ac7b79..1a765792fbf 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -21,6 +21,7 @@ #include "ioctl.h" #include "ulist.h" +#include "extent_io.h" #define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) @@ -60,6 +61,9 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 delayed_ref_seq, u64 time_seq, struct ulist **roots); +char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, struct extent_buffer *eb, + u64 parent, char *dest, u32 size); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, -- cgit v1.2.3 From 8ea05e3a4262b9e6871c349fa3486bcfc72ffd1a Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 25 Jul 2012 17:35:53 +0200 Subject: Btrfs: introduce subvol uuids and times This patch introduces uuids for subvolumes. Each subvolume has it's own uuid. In case it was snapshotted, it also contains parent_uuid. In case it was received, it also contains received_uuid. It also introduces subvolume ctime/otime/stime/rtime. The first two are comparable to the times found in inodes. otime is the origin/creation time and ctime is the change time. stime/rtime are only valid on received subvolumes. stime is the time of the subvolume when it was sent. rtime is the time of the subvolume when it was received. Additionally to the times, we have a transid for each time. They are updated at the same place as the times. btrfs receive uses stransid and rtransid to find out if a received subvolume changed in the meantime. If an older kernel mounts a filesystem with the extented fields, all fields become invalid. The next mount with a new kernel will detect this and reset the fields. Signed-off-by: Alexander Block Reviewed-by: David Sterba Reviewed-by: Arne Jansen Reviewed-by: Jan Schmidt Reviewed-by: Alex Lyakas --- fs/btrfs/check-integrity.c | 7 +-- fs/btrfs/ctree.h | 47 ++++++++++++++++++++ fs/btrfs/disk-io.c | 8 ++-- fs/btrfs/inode.c | 4 ++ fs/btrfs/ioctl.c | 100 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/ioctl.h | 17 +++++++ fs/btrfs/root-tree.c | 107 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/transaction.c | 17 +++++++ 8 files changed, 292 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index da6e9364a5e..9197e2e3340 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1032,6 +1032,7 @@ continue_with_current_leaf_stack_frame: struct btrfs_disk_key *disk_key; u8 type; u32 item_offset; + u32 item_size; if (disk_item_offset + sizeof(struct btrfs_item) > sf->block_ctx->len) { @@ -1047,6 +1048,7 @@ leaf_item_out_of_bounce_error: disk_item_offset, sizeof(struct btrfs_item)); item_offset = le32_to_cpu(disk_item.offset); + item_size = le32_to_cpu(disk_item.size); disk_key = &disk_item.key; type = disk_key->type; @@ -1057,14 +1059,13 @@ leaf_item_out_of_bounce_error: root_item_offset = item_offset + offsetof(struct btrfs_leaf, items); - if (root_item_offset + - sizeof(struct btrfs_root_item) > + if (root_item_offset + item_size > sf->block_ctx->len) goto leaf_item_out_of_bounce_error; btrfsic_read_from_block_data( sf->block_ctx, &root_item, root_item_offset, - sizeof(struct btrfs_root_item)); + item_size); next_bytenr = le64_to_cpu(root_item.bytenr); sf->error = diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8cfde9326dd..d5f6d745867 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -709,6 +709,36 @@ struct btrfs_root_item { struct btrfs_disk_key drop_progress; u8 drop_level; u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. Everytime the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + u8 uuid[BTRFS_UUID_SIZE]; + u8 parent_uuid[BTRFS_UUID_SIZE]; + u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); /* @@ -1416,6 +1446,8 @@ struct btrfs_root { dev_t anon_dev; int force_cow; + + spinlock_t root_times_lock; }; struct btrfs_ioctl_defrag_range_args { @@ -2189,6 +2221,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, + ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, + otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, + stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, + rtransid, 64); static inline bool btrfs_root_readonly(struct btrfs_root *root) { @@ -2822,6 +2864,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); +void btrfs_read_root_item(struct btrfs_root *root, + struct extent_buffer *eb, int slot, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); @@ -2829,6 +2874,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root); /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2936ca49b3b..c39eb71fae3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; + + spin_lock_init(&root->root_times_lock); } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, @@ -1326,6 +1328,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u64 generation; u32 blocksize; int ret = 0; + int slot; root = btrfs_alloc_root(fs_info); if (!root) @@ -1352,9 +1355,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); if (ret == 0) { l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); + slot = path->slots[0]; + btrfs_read_root_item(tree_root, l, slot, &root->root_item); memcpy(&root->root_key, location, sizeof(*location)); } btrfs_free_path(path); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7d1921ac76..4ffc8738954 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2734,6 +2734,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, */ if (!btrfs_is_free_space_inode(root, inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + btrfs_update_root_times(trans, root); + ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); @@ -4723,6 +4725,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, inode); + btrfs_update_root_times(trans, root); + return inode; fail: if (dir) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7011871c45b..99fe2ce7f72 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -346,11 +347,13 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_root *new_root; struct dentry *parent = dentry->d_parent; struct inode *dir; + struct timespec cur_time = CURRENT_TIME; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + uuid_le new_uuid; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) @@ -389,8 +392,9 @@ static noinline int create_subvol(struct btrfs_root *root, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); + memset(&root_item, 0, sizeof(root_item)); + inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); inode_item->generation = cpu_to_le64(1); inode_item->size = cpu_to_le64(3); inode_item->nlink = cpu_to_le32(1); @@ -408,8 +412,15 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_used(&root_item, leaf->len); btrfs_set_root_last_snapshot(&root_item, 0); - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; + btrfs_set_root_generation_v2(&root_item, + btrfs_root_generation(&root_item)); + uuid_le_gen(&new_uuid); + memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); + root_item.otime.sec = cpu_to_le64(cur_time.tv_sec); + root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec); + root_item.ctime = root_item.otime; + btrfs_set_root_ctransid(&root_item, trans->transid); + btrfs_set_root_otransid(&root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); @@ -3395,6 +3406,87 @@ out: return ret; } +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root_item *root_item = &root->root_item; + struct btrfs_trans_handle *trans; + struct timespec ct = CURRENT_TIME; + int ret = 0; + + ret = mnt_want_write_file(file); + if (ret < 0) + return ret; + + down_write(&root->fs_info->subvol_sem); + + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { + ret = -EINVAL; + goto out; + } + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; + } + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out; + } + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + sa->rtransid = trans->transid; + sa->rtime.sec = ct.tv_sec; + sa->rtime.nsec = ct.tv_nsec; + + memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); + btrfs_set_root_stransid(root_item, sa->stransid); + btrfs_set_root_rtransid(root_item, sa->rtransid); + root_item->stime.sec = cpu_to_le64(sa->stime.sec); + root_item->stime.nsec = cpu_to_le32(sa->stime.nsec); + root_item->rtime.sec = cpu_to_le64(sa->rtime.sec); + root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec); + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans, root); + trans = NULL; + goto out; + } else { + ret = btrfs_commit_transaction(trans, root); + if (ret < 0) + goto out; + } + + ret = copy_to_user(arg, sa, sizeof(*sa)); + if (ret) + ret = -EFAULT; + +out: + kfree(sa); + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3477,6 +3569,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_SET_RECEIVED_SUBVOL: + return btrfs_ioctl_set_received_subvol(file, argp); case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp, 0); case BTRFS_IOC_GET_AND_RESET_DEV_STATS: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e440aa653c3..0c505d7ff8e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -295,6 +295,21 @@ struct btrfs_ioctl_get_dev_stats { __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ }; +struct btrfs_ioctl_timespec { + __u64 sec; + __u32 nsec; +}; + +struct btrfs_ioctl_received_subvol_args { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec stime; /* in */ + struct btrfs_ioctl_timespec rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -359,6 +374,8 @@ struct btrfs_ioctl_get_dev_stats { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 24fb8ce4e07..6bb465cca20 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,11 +16,54 @@ * Boston, MA 021110-1307, USA. */ +#include #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +/* + * Read a root item from the tree. In case we detect a root item smaller then + * sizeof(root_item), we know it's an old version of the root structure and + * initialize all new fields to zero. The same happens if we detect mismatching + * generation numbers as then we know the root was once mounted with an older + * kernel that was not aware of the root item structure change. + */ +void btrfs_read_root_item(struct btrfs_root *root, + struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) +{ + uuid_le uuid; + int len; + int need_reset = 0; + + len = btrfs_item_size_nr(eb, slot); + read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), + min_t(int, len, (int)sizeof(*item))); + if (len < sizeof(*item)) + need_reset = 1; + if (!need_reset && btrfs_root_generation(item) + != btrfs_root_generation_v2(item)) { + if (btrfs_root_generation_v2(item) != 0) { + printk(KERN_WARNING "btrfs: mismatching " + "generation and generation_v2 " + "found in root item. This root " + "was probably mounted with an " + "older kernel. Resetting all " + "new fields.\n"); + } + need_reset = 1; + } + if (need_reset) { + memset(&item->generation_v2, 0, + sizeof(*item) - offsetof(struct btrfs_root_item, + generation_v2)); + + uuid_le_gen(&uuid); + memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); + } +} + /* * lookup the root with the highest offset for a given objectid. The key we do * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 @@ -61,10 +104,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, goto out; } if (item) - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), - sizeof(*item)); + btrfs_read_root_item(root, l, slot, item); if (key) memcpy(key, &found_key, sizeof(found_key)); + ret = 0; out: btrfs_free_path(path); @@ -91,16 +134,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; + int old_len; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } + if (ret < 0) + goto out_abort; if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -113,16 +155,56 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); + old_len = btrfs_item_size_nr(l, slot); + + /* + * If this is the first time we update the root item which originated + * from an older kernel, we need to enlarge the item size to make room + * for the added fields. + */ + if (old_len < sizeof(*item)) { + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, key, path, + -1, 1); + if (ret < 0) + goto out_abort; + ret = btrfs_del_item(trans, root, path); + if (ret < 0) + goto out_abort; + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, + key, sizeof(*item)); + if (ret < 0) + goto out_abort; + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + } + + /* + * Update generation_v2 so at the next mount we know the new root + * fields are valid. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); + write_extent_buffer(l, item, ptr, sizeof(*item)); btrfs_mark_buffer_dirty(path->nodes[0]); out: btrfs_free_path(path); return ret; + +out_abort: + btrfs_abort_transaction(trans, root, ret); + goto out; } int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item) { + /* + * Make sure generation v1 and v2 match. See update_root for details. + */ + btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } @@ -454,3 +536,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) root_item->byte_limit = 0; } } + +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root_item *item = &root->root_item; + struct timespec ct = CURRENT_TIME; + + spin_lock(&root->root_times_lock); + item->ctransid = trans->transid; + item->ctime.sec = cpu_to_le64(ct.tv_sec); + item->ctime.nsec = cpu_to_le64(ct.tv_nsec); + spin_unlock(&root->root_times_lock); +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183e..a21f3085a33 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -926,11 +927,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; + struct timespec cur_time = CURRENT_TIME; int ret; u64 to_reserve = 0; u64 index = 0; u64 objectid; u64 root_flags; + uuid_le new_uuid; rsv = trans->block_rsv; @@ -1016,6 +1019,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; btrfs_set_root_flags(new_root_item, root_flags); + btrfs_set_root_generation_v2(new_root_item, + trans->transid); + uuid_le_gen(&new_uuid); + memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + memcpy(new_root_item->parent_uuid, root->root_item.uuid, + BTRFS_UUID_SIZE); + new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); + new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec); + btrfs_set_root_otransid(new_root_item, trans->transid); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); if (ret) { -- cgit v1.2.3 From 7069830a9e381e33d44ded45095f764844c71d24 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 5 Jun 2012 21:07:48 +0200 Subject: Btrfs: add btrfs_compare_trees function This function is used to find the differences between two trees. The tree compare skips whole subtrees if it detects shared tree blocks and thus is pretty fast. Signed-off-by: Alexander Block Reviewed-by: David Sterba Reviewed-by: Arne Jansen Reviewed-by: Jan Schmidt Reviewed-by: Alex Lyakas --- fs/btrfs/ctree.c | 425 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 15 ++ 2 files changed, 440 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c82a9e4a953..4c10fd19d48 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5005,6 +5005,431 @@ out: return ret; } +static void tree_move_down(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], + path->slots[*level]); + path->slots[*level - 1] = 0; + (*level)--; +} + +static int tree_move_next_or_upnext(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] == nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(root, path, level, root_level); + } else { + tree_move_down(root, path, level, root_level); + ret = 0; + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_root *left_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + int ret; + int cmp; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_start_ctransid; + u64 right_start_ctransid; + u64 ctransid; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + spin_lock(&left_root->root_times_lock); + left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); + spin_unlock(&left_root->root_times_lock); + + spin_lock(&right_root->root_times_lock); + right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); + spin_unlock(&right_root->root_times_lock); + + trans = btrfs_join_transaction(left_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = left_root->commit_root; + extent_buffer_get(left_path->nodes[left_level]); + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = right_root->commit_root; + extent_buffer_get(right_path->nodes[right_level]); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + /* + * We need to make sure the transaction does not get committed + * while we do anything on commit roots. This means, we need to + * join and leave transactions for every item that we process. + */ + if (trans && btrfs_should_end_transaction(trans, left_root)) { + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + ret = btrfs_end_transaction(trans, left_root); + trans = NULL; + if (ret < 0) + goto out; + } + /* now rejoin the transaction */ + if (!trans) { + trans = btrfs_join_transaction(left_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + spin_lock(&left_root->root_times_lock); + ctransid = btrfs_root_ctransid(&left_root->root_item); + spin_unlock(&left_root->root_times_lock); + if (ctransid != left_start_ctransid) + left_start_ctransid = 0; + + spin_lock(&right_root->root_times_lock); + ctransid = btrfs_root_ctransid(&right_root->root_item); + spin_unlock(&right_root->root_times_lock); + if (ctransid != right_start_ctransid) + right_start_ctransid = 0; + + if (!left_start_ctransid || !right_start_ctransid) { + WARN(1, KERN_WARNING + "btrfs: btrfs_compare_tree detected " + "a change in one of the trees while " + "iterating. This is probably a " + "bug.\n"); + ret = -EIO; + goto out; + } + + /* + * the commit root may have changed, so start again + * where we stopped + */ + left_path->lowest_level = left_level; + right_path->lowest_level = right_level; + ret = btrfs_search_slot(NULL, left_root, + &left_key, left_path, 0, 0); + if (ret < 0) + goto out; + ret = btrfs_search_slot(NULL, right_root, + &right_key, right_path, 0, 0); + if (ret < 0) + goto out; + } + + if (advance_left && !left_end_reached) { + ret = tree_advance(left_root, left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret < 0) + left_end_reached = ADVANCE; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_root, right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret < 0) + right_end_reached = ADVANCE; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + ret = tree_compare_item(left_root, left_path, + right_path, tmp_buf); + if (ret) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_CHANGED, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kfree(tmp_buf); + + if (trans) { + if (!ret) + ret = btrfs_end_transaction(trans, left_root); + else + btrfs_end_transaction(trans, left_root); + } + + return ret; +} + /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d5f6d745867..2fbbe738cae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2722,6 +2722,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, struct btrfs_path *path, int cache_only, u64 min_trans); +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t cb, void *ctx); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, -- cgit v1.2.3 From 31db9f7c23fbf7e95026143f79645de6507b583b Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 25 Jul 2012 23:19:24 +0200 Subject: Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive This patch introduces the BTRFS_IOC_SEND ioctl that is required for send. It allows btrfs-progs to implement full and incremental sends. Patches for btrfs-progs will follow. Signed-off-by: Alexander Block Reviewed-by: David Sterba Reviewed-by: Arne Jansen Reviewed-by: Jan Schmidt Reviewed-by: Alex Lyakas --- fs/btrfs/Makefile | 2 +- fs/btrfs/ioctl.c | 3 + fs/btrfs/ioctl.h | 10 + fs/btrfs/send.c | 4570 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/send.h | 133 ++ 5 files changed, 4717 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/send.c create mode 100644 fs/btrfs/send.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4fa2befae..f740644bb5a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o ulist.o send.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 99fe2ce7f72..bca6997fdb8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -54,6 +54,7 @@ #include "inode-map.h" #include "backref.h" #include "rcu-string.h" +#include "send.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -3571,6 +3572,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); + case BTRFS_IOC_SEND: + return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp, 0); case BTRFS_IOC_GET_AND_RESET_DEV_STATS: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 0c505d7ff8e..27097e8bfa3 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -310,6 +310,15 @@ struct btrfs_ioctl_received_subvol_args { __u64 reserved[16]; /* in */ }; +struct btrfs_ioctl_send_args { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + __u64 __user *clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u64 reserved[4]; /* in */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -376,6 +385,7 @@ struct btrfs_ioctl_received_subvol_args { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args) +#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c new file mode 100644 index 00000000000..5394cb75012 --- /dev/null +++ b/fs/btrfs/send.c @@ -0,0 +1,4570 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "send.h" +#include "backref.h" +#include "locking.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "transaction.h" + +static int g_verbose = 0; + +#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__) + +/* + * A fs_path is a helper to dynamically build path names with unknown size. + * It reallocates the internal buffer on demand. + * It allows fast adding of path elements on the right side (normal path) and + * fast adding to the left side (reversed path). A reversed path can also be + * unreversed if needed. + */ +struct fs_path { + union { + struct { + char *start; + char *end; + char *prepared; + + char *buf; + int buf_len; + int reversed:1; + int virtual_mem:1; + char inline_buf[]; + }; + char pad[PAGE_SIZE]; + }; +}; +#define FS_PATH_INLINE_SIZE \ + (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + + +/* reused for each extent */ +struct clone_root { + struct btrfs_root *root; + u64 ino; + u64 offset; + + u64 found_refs; +}; + +#define SEND_CTX_MAX_NAME_CACHE_SIZE 128 +#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) + +struct send_ctx { + struct file *send_filp; + loff_t send_off; + char *send_buf; + u32 send_size; + u32 send_max_size; + u64 total_send_size; + u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + + struct vfsmount *mnt; + + struct btrfs_root *send_root; + struct btrfs_root *parent_root; + struct clone_root *clone_roots; + int clone_roots_cnt; + + /* current state of the compare_tree call */ + struct btrfs_path *left_path; + struct btrfs_path *right_path; + struct btrfs_key *cmp_key; + + /* + * infos of the currently processed inode. In case of deleted inodes, + * these are the values from the deleted inode. + */ + u64 cur_ino; + u64 cur_inode_gen; + int cur_inode_new; + int cur_inode_new_gen; + int cur_inode_deleted; + int cur_inode_first_ref_orphan; + u64 cur_inode_size; + u64 cur_inode_mode; + + u64 send_progress; + + struct list_head new_refs; + struct list_head deleted_refs; + + struct radix_tree_root name_cache; + struct list_head name_cache_list; + int name_cache_size; + + struct file *cur_inode_filp; + char *read_buf; +}; + +struct name_cache_entry { + struct list_head list; + struct list_head use_list; + u64 ino; + u64 gen; + u64 parent_ino; + u64 parent_gen; + int ret; + int need_later_update; + int name_len; + char name[]; +}; + +static void fs_path_reset(struct fs_path *p) +{ + if (p->reversed) { + p->start = p->buf + p->buf_len - 1; + p->end = p->start; + *p->start = 0; + } else { + p->start = p->buf; + p->end = p->start; + *p->start = 0; + } +} + +static struct fs_path *fs_path_alloc(struct send_ctx *sctx) +{ + struct fs_path *p; + + p = kmalloc(sizeof(*p), GFP_NOFS); + if (!p) + return NULL; + p->reversed = 0; + p->virtual_mem = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); + return p; +} + +static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) +{ + struct fs_path *p; + + p = fs_path_alloc(sctx); + if (!p) + return NULL; + p->reversed = 1; + fs_path_reset(p); + return p; +} + +static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) +{ + if (!p) + return; + if (p->buf != p->inline_buf) { + if (p->virtual_mem) + vfree(p->buf); + else + kfree(p->buf); + } + kfree(p); +} + +static int fs_path_len(struct fs_path *p) +{ + return p->end - p->start; +} + +static int fs_path_ensure_buf(struct fs_path *p, int len) +{ + char *tmp_buf; + int path_len; + int old_buf_len; + + len++; + + if (p->buf_len >= len) + return 0; + + path_len = p->end - p->start; + old_buf_len = p->buf_len; + len = PAGE_ALIGN(len); + + if (p->buf == p->inline_buf) { + tmp_buf = kmalloc(len, GFP_NOFS); + if (!tmp_buf) { + tmp_buf = vmalloc(len); + if (!tmp_buf) + return -ENOMEM; + p->virtual_mem = 1; + } + memcpy(tmp_buf, p->buf, p->buf_len); + p->buf = tmp_buf; + p->buf_len = len; + } else { + if (p->virtual_mem) { + tmp_buf = vmalloc(len); + if (!tmp_buf) + return -ENOMEM; + memcpy(tmp_buf, p->buf, p->buf_len); + vfree(p->buf); + } else { + tmp_buf = krealloc(p->buf, len, GFP_NOFS); + if (!tmp_buf) { + tmp_buf = vmalloc(len); + if (!tmp_buf) + return -ENOMEM; + memcpy(tmp_buf, p->buf, p->buf_len); + kfree(p->buf); + p->virtual_mem = 1; + } + } + p->buf = tmp_buf; + p->buf_len = len; + } + if (p->reversed) { + tmp_buf = p->buf + old_buf_len - path_len - 1; + p->end = p->buf + p->buf_len - 1; + p->start = p->end - path_len; + memmove(p->start, tmp_buf, path_len + 1); + } else { + p->start = p->buf; + p->end = p->start + path_len; + } + return 0; +} + +static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +{ + int ret; + int new_len; + + new_len = p->end - p->start + name_len; + if (p->start != p->end) + new_len++; + ret = fs_path_ensure_buf(p, new_len); + if (ret < 0) + goto out; + + if (p->reversed) { + if (p->start != p->end) + *--p->start = '/'; + p->start -= name_len; + p->prepared = p->start; + } else { + if (p->start != p->end) + *p->end++ = '/'; + p->prepared = p->end; + p->end += name_len; + *p->end = 0; + } + +out: + return ret; +} + +static int fs_path_add(struct fs_path *p, const char *name, int name_len) +{ + int ret; + + ret = fs_path_prepare_for_add(p, name_len); + if (ret < 0) + goto out; + memcpy(p->prepared, name, name_len); + p->prepared = NULL; + +out: + return ret; +} + +static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +{ + int ret; + + ret = fs_path_prepare_for_add(p, p2->end - p2->start); + if (ret < 0) + goto out; + memcpy(p->prepared, p2->start, p2->end - p2->start); + p->prepared = NULL; + +out: + return ret; +} + +static int fs_path_add_from_extent_buffer(struct fs_path *p, + struct extent_buffer *eb, + unsigned long off, int len) +{ + int ret; + + ret = fs_path_prepare_for_add(p, len); + if (ret < 0) + goto out; + + read_extent_buffer(eb, p->prepared, off, len); + p->prepared = NULL; + +out: + return ret; +} + +static void fs_path_remove(struct fs_path *p) +{ + BUG_ON(p->reversed); + while (p->start != p->end && *p->end != '/') + p->end--; + *p->end = 0; +} + +static int fs_path_copy(struct fs_path *p, struct fs_path *from) +{ + int ret; + + p->reversed = from->reversed; + fs_path_reset(p); + + ret = fs_path_add_path(p, from); + + return ret; +} + + +static void fs_path_unreverse(struct fs_path *p) +{ + char *tmp; + int len; + + if (!p->reversed) + return; + + tmp = p->start; + len = p->end - p->start; + p->start = p->buf; + p->end = p->start + len; + memmove(p->start, tmp, len + 1); + p->reversed = 0; +} + +static struct btrfs_path *alloc_path_for_send(void) +{ + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return NULL; + path->search_commit_root = 1; + path->skip_locking = 1; + return path; +} + +static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) +{ + int ret; + mm_segment_t old_fs; + u32 pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + while (pos < len) { + ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, + &sctx->send_off); + /* TODO handle that correctly */ + /*if (ret == -ERESTARTSYS) { + continue; + }*/ + if (ret < 0) + goto out; + if (ret == 0) { + ret = -EIO; + goto out; + } + pos += ret; + } + + ret = 0; + +out: + set_fs(old_fs); + return ret; +} + +static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) +{ + struct btrfs_tlv_header *hdr; + int total_len = sizeof(*hdr) + len; + int left = sctx->send_max_size - sctx->send_size; + + if (unlikely(left < total_len)) + return -EOVERFLOW; + + hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); + hdr->tlv_type = cpu_to_le16(attr); + hdr->tlv_len = cpu_to_le16(len); + memcpy(hdr + 1, data, len); + sctx->send_size += total_len; + + return 0; +} + +#if 0 +static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) +{ + return tlv_put(sctx, attr, &value, sizeof(value)); +} + +static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) +{ + __le16 tmp = cpu_to_le16(value); + return tlv_put(sctx, attr, &tmp, sizeof(tmp)); +} + +static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) +{ + __le32 tmp = cpu_to_le32(value); + return tlv_put(sctx, attr, &tmp, sizeof(tmp)); +} +#endif + +static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) +{ + __le64 tmp = cpu_to_le64(value); + return tlv_put(sctx, attr, &tmp, sizeof(tmp)); +} + +static int tlv_put_string(struct send_ctx *sctx, u16 attr, + const char *str, int len) +{ + if (len == -1) + len = strlen(str); + return tlv_put(sctx, attr, str, len); +} + +static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, + const u8 *uuid) +{ + return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); +} + +#if 0 +static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, + struct timespec *ts) +{ + struct btrfs_timespec bts; + bts.sec = cpu_to_le64(ts->tv_sec); + bts.nsec = cpu_to_le32(ts->tv_nsec); + return tlv_put(sctx, attr, &bts, sizeof(bts)); +} +#endif + +static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, + struct extent_buffer *eb, + struct btrfs_timespec *ts) +{ + struct btrfs_timespec bts; + read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); + return tlv_put(sctx, attr, &bts, sizeof(bts)); +} + + +#define TLV_PUT(sctx, attrtype, attrlen, data) \ + do { \ + ret = tlv_put(sctx, attrtype, attrlen, data); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_INT(sctx, attrtype, bits, value) \ + do { \ + ret = tlv_put_u##bits(sctx, attrtype, value); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) +#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) +#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) +#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) +#define TLV_PUT_STRING(sctx, attrtype, str, len) \ + do { \ + ret = tlv_put_string(sctx, attrtype, str, len); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_PATH(sctx, attrtype, p) \ + do { \ + ret = tlv_put_string(sctx, attrtype, p->start, \ + p->end - p->start); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while(0) +#define TLV_PUT_UUID(sctx, attrtype, uuid) \ + do { \ + ret = tlv_put_uuid(sctx, attrtype, uuid); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ + do { \ + ret = tlv_put_timespec(sctx, attrtype, ts); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) +#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ + do { \ + ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ + if (ret < 0) \ + goto tlv_put_failure; \ + } while (0) + +static int send_header(struct send_ctx *sctx) +{ + struct btrfs_stream_header hdr; + + strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); + + return write_buf(sctx, &hdr, sizeof(hdr)); +} + +/* + * For each command/item we want to send to userspace, we call this function. + */ +static int begin_cmd(struct send_ctx *sctx, int cmd) +{ + struct btrfs_cmd_header *hdr; + + if (!sctx->send_buf) { + WARN_ON(1); + return -EINVAL; + } + + BUG_ON(sctx->send_size); + + sctx->send_size += sizeof(*hdr); + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->cmd = cpu_to_le16(cmd); + + return 0; +} + +static int send_cmd(struct send_ctx *sctx) +{ + int ret; + struct btrfs_cmd_header *hdr; + u32 crc; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); + hdr->crc = 0; + + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx, sctx->send_buf, sctx->send_size); + + sctx->total_send_size += sctx->send_size; + sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->send_size = 0; + + return ret; +} + +/* + * Sends a move instruction to user space + */ +static int send_rename(struct send_ctx *sctx, + struct fs_path *from, struct fs_path *to) +{ + int ret; + +verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a link instruction to user space + */ +static int send_link(struct send_ctx *sctx, + struct fs_path *path, struct fs_path *lnk) +{ + int ret; + +verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends an unlink instruction to user space + */ +static int send_unlink(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_unlink %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Sends a rmdir instruction to user space + */ +static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) +{ + int ret; + +verbose_printk("btrfs: send_rmdir %s\n", path->start); + + ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +/* + * Helper function to retrieve some fields from an inode item. + */ +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid) +{ + int ret; + struct btrfs_inode_item *ii; + struct btrfs_key key; + struct btrfs_path *path; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (size) + *size = btrfs_inode_size(path->nodes[0], ii); + if (gen) + *gen = btrfs_inode_generation(path->nodes[0], ii); + if (mode) + *mode = btrfs_inode_mode(path->nodes[0], ii); + if (uid) + *uid = btrfs_inode_uid(path->nodes[0], ii); + if (gid) + *gid = btrfs_inode_gid(path->nodes[0], ii); + +out: + btrfs_free_path(path); + return ret; +} + +typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, + struct fs_path *p, + void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_inode_ref. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the INODE_REF when called. + */ +static int iterate_inode_ref(struct send_ctx *sctx, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, int resolve, + iterate_inode_ref_t iterate, void *ctx) +{ + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_path *tmp_path; + struct fs_path *p; + u32 cur; + u32 len; + u32 total; + int slot; + u32 name_len; + char *start; + int ret = 0; + int num; + int index; + + p = fs_path_alloc_reversed(sctx); + if (!p) + return -ENOMEM; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) { + fs_path_free(sctx, p); + return -ENOMEM; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + cur = 0; + len = 0; + total = btrfs_item_size(eb, item); + + num = 0; + while (cur < total) { + fs_path_reset(p); + + name_len = btrfs_inode_ref_name_len(eb, iref); + index = btrfs_inode_ref_index(eb, iref); + if (resolve) { + start = btrfs_iref_to_path(root, tmp_path, iref, eb, + found_key->offset, p->buf, + p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + if (start < p->buf) { + /* overflow , try again with larger buffer */ + ret = fs_path_ensure_buf(p, + p->buf_len + p->buf - start); + if (ret < 0) + goto out; + start = btrfs_iref_to_path(root, tmp_path, iref, + eb, found_key->offset, p->buf, + p->buf_len); + if (IS_ERR(start)) { + ret = PTR_ERR(start); + goto out; + } + BUG_ON(start < p->buf); + } + p->start = start; + } else { + ret = fs_path_add_from_extent_buffer(p, eb, + (unsigned long)(iref + 1), name_len); + if (ret < 0) + goto out; + } + + + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + cur += len; + + ret = iterate(num, found_key->offset, index, p, ctx); + if (ret) + goto out; + + num++; + } + +out: + btrfs_free_path(tmp_path); + fs_path_free(sctx, p); + return ret; +} + +typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx); + +/* + * Helper function to iterate the entries in ONE btrfs_dir_item. + * The iterate callback may return a non zero value to stop iteration. This can + * be a negative value for error codes or 1 to simply stop it. + * + * path must point to the dir item when called. + */ +static int iterate_dir_item(struct send_ctx *sctx, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *found_key, + iterate_dir_item_t iterate, void *ctx) +{ + int ret = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_path *tmp_path = NULL; + struct btrfs_key di_key; + char *buf = NULL; + char *buf2 = NULL; + int buf_len; + int buf_virtual = 0; + u32 name_len; + u32 data_len; + u32 cur; + u32 len; + u32 total; + int slot; + int num; + u8 type; + + buf_len = PAGE_SIZE; + buf = kmalloc(buf_len, GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + tmp_path = alloc_path_for_send(); + if (!tmp_path) { + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item = btrfs_item_nr(eb, slot); + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + cur = 0; + len = 0; + total = btrfs_item_size(eb, item); + + num = 0; + while (cur < total) { + name_len = btrfs_dir_name_len(eb, di); + data_len = btrfs_dir_data_len(eb, di); + type = btrfs_dir_type(eb, di); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (name_len + data_len > buf_len) { + buf_len = PAGE_ALIGN(name_len + data_len); + if (buf_virtual) { + buf2 = vmalloc(buf_len); + if (!buf2) { + ret = -ENOMEM; + goto out; + } + vfree(buf); + } else { + buf2 = krealloc(buf, buf_len, GFP_NOFS); + if (!buf2) { + buf2 = vmalloc(buf_len); + if (!buf2) { + ret = -ENOMEM; + goto out; + } + kfree(buf); + buf_virtual = 1; + } + } + + buf = buf2; + buf2 = NULL; + } + + read_extent_buffer(eb, buf, (unsigned long)(di + 1), + name_len + data_len); + + len = sizeof(*di) + name_len + data_len; + di = (struct btrfs_dir_item *)((char *)di + len); + cur += len; + + ret = iterate(num, &di_key, buf, name_len, buf + name_len, + data_len, type, ctx); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + num++; + } + +out: + btrfs_free_path(tmp_path); + if (buf_virtual) + vfree(buf); + else + kfree(buf); + return ret; +} + +static int __copy_first_ref(int num, u64 dir, int index, + struct fs_path *p, void *ctx) +{ + int ret; + struct fs_path *pt = ctx; + + ret = fs_path_copy(pt, p); + if (ret < 0) + return ret; + + /* we want the first only */ + return 1; +} + +/* + * Retrieve the first path of an inode. If an inode has more then one + * ref/hardlink, this is ignored. + */ +static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, + u64 ino, struct fs_path *path) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_path *p; + + p = alloc_path_for_send(); + if (!p) + return -ENOMEM; + + fs_path_reset(path); + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 1; + goto out; + } + btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); + if (found_key.objectid != ino || + found_key.type != BTRFS_INODE_REF_KEY) { + ret = -ENOENT; + goto out; + } + + ret = iterate_inode_ref(sctx, root, p, &found_key, 1, + __copy_first_ref, path); + if (ret < 0) + goto out; + ret = 0; + +out: + btrfs_free_path(p); + return ret; +} + +struct backref_ctx { + struct send_ctx *sctx; + + /* number of total found references */ + u64 found; + + /* + * used for clones found in send_root. clones found behind cur_objectid + * and cur_offset are not considered as allowed clones. + */ + u64 cur_objectid; + u64 cur_offset; + + /* may be truncated in case it's the last extent in a file */ + u64 extent_len; + + /* Just to check for bugs in backref resolving */ + int found_in_send_root; +}; + +static int __clone_root_cmp_bsearch(const void *key, const void *elt) +{ + u64 root = (u64)key; + struct clone_root *cr = (struct clone_root *)elt; + + if (root < cr->root->objectid) + return -1; + if (root > cr->root->objectid) + return 1; + return 0; +} + +static int __clone_root_cmp_sort(const void *e1, const void *e2) +{ + struct clone_root *cr1 = (struct clone_root *)e1; + struct clone_root *cr2 = (struct clone_root *)e2; + + if (cr1->root->objectid < cr2->root->objectid) + return -1; + if (cr1->root->objectid > cr2->root->objectid) + return 1; + return 0; +} + +/* + * Called for every backref that is found for the current extent. + */ +static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +{ + struct backref_ctx *bctx = ctx_; + struct clone_root *found; + int ret; + u64 i_size; + + /* First check if the root is in the list of accepted clone sources */ + found = bsearch((void *)root, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!found) + return 0; + + if (found->root == bctx->sctx->send_root && + ino == bctx->cur_objectid && + offset == bctx->cur_offset) { + bctx->found_in_send_root = 1; + } + + /* + * There are inodes that have extents that lie behind it's i_size. Don't + * accept clones from these extents. + */ + ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); + if (ret < 0) + return ret; + + if (offset + bctx->extent_len > i_size) + return 0; + + /* + * Make sure we don't consider clones from send_root that are + * behind the current inode/offset. + */ + if (found->root == bctx->sctx->send_root) { + /* + * TODO for the moment we don't accept clones from the inode + * that is currently send. We may change this when + * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same + * file. + */ + if (ino >= bctx->cur_objectid) + return 0; + /*if (ino > ctx->cur_objectid) + return 0; + if (offset + ctx->extent_len > ctx->cur_offset) + return 0;*/ + + bctx->found++; + found->found_refs++; + found->ino = ino; + found->offset = offset; + return 0; + } + + bctx->found++; + found->found_refs++; + if (ino < found->ino) { + found->ino = ino; + found->offset = offset; + } else if (found->ino == ino) { + /* + * same extent found more then once in the same file. + */ + if (found->offset > offset + bctx->extent_len) + found->offset = offset; + } + + return 0; +} + +/* + * path must point to the extent item when called. + */ +static int find_extent_clone(struct send_ctx *sctx, + struct btrfs_path *path, + u64 ino, u64 data_offset, + u64 ino_size, + struct clone_root **found) +{ + int ret; + int extent_type; + u64 logical; + u64 num_bytes; + u64 extent_item_pos; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb = path->nodes[0]; + struct backref_ctx backref_ctx; + struct clone_root *cur_clone_root; + struct btrfs_key found_key; + struct btrfs_path *tmp_path; + u32 i; + + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + if (data_offset >= ino_size) { + /* + * There may be extents that lie behind the file's size. + * I at least had this in combination with snapshotting while + * writing large files. + */ + ret = 0; + goto out; + } + + fi = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -ENOENT; + goto out; + } + + num_bytes = btrfs_file_extent_num_bytes(eb, fi); + logical = btrfs_file_extent_disk_bytenr(eb, fi); + if (logical == 0) { + ret = -ENOENT; + goto out; + } + logical += btrfs_file_extent_offset(eb, fi); + + ret = extent_from_logical(sctx->send_root->fs_info, + logical, tmp_path, &found_key); + btrfs_release_path(tmp_path); + + if (ret < 0) + goto out; + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = -EIO; + goto out; + } + + /* + * Setup the clone roots. + */ + for (i = 0; i < sctx->clone_roots_cnt; i++) { + cur_clone_root = sctx->clone_roots + i; + cur_clone_root->ino = (u64)-1; + cur_clone_root->offset = 0; + cur_clone_root->found_refs = 0; + } + + backref_ctx.sctx = sctx; + backref_ctx.found = 0; + backref_ctx.cur_objectid = ino; + backref_ctx.cur_offset = data_offset; + backref_ctx.found_in_send_root = 0; + backref_ctx.extent_len = num_bytes; + + /* + * The last extent of a file may be too large due to page alignment. + * We need to adjust extent_len in this case so that the checks in + * __iterate_backrefs work. + */ + if (data_offset + num_bytes >= ino_size) + backref_ctx.extent_len = ino_size - data_offset; + + /* + * Now collect all backrefs. + */ + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(sctx->send_root->fs_info, + found_key.objectid, extent_item_pos, 1, + __iterate_backrefs, &backref_ctx); + if (ret < 0) + goto out; + + if (!backref_ctx.found_in_send_root) { + /* found a bug in backref code? */ + ret = -EIO; + printk(KERN_ERR "btrfs: ERROR did not find backref in " + "send_root. inode=%llu, offset=%llu, " + "logical=%llu\n", + ino, data_offset, logical); + goto out; + } + +verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " + "ino=%llu, " + "num_bytes=%llu, logical=%llu\n", + data_offset, ino, num_bytes, logical); + + if (!backref_ctx.found) + verbose_printk("btrfs: no clones found\n"); + + cur_clone_root = NULL; + for (i = 0; i < sctx->clone_roots_cnt; i++) { + if (sctx->clone_roots[i].found_refs) { + if (!cur_clone_root) + cur_clone_root = sctx->clone_roots + i; + else if (sctx->clone_roots[i].root == sctx->send_root) + /* prefer clones from send_root over others */ + cur_clone_root = sctx->clone_roots + i; + break; + } + + } + + if (cur_clone_root) { + *found = cur_clone_root; + ret = 0; + } else { + ret = -ENOENT; + } + +out: + btrfs_free_path(tmp_path); + return ret; +} + +static int read_symlink(struct send_ctx *sctx, + struct btrfs_root *root, + u64 ino, + struct fs_path *dest) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u8 type; + u8 compression; + unsigned long off; + int len; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + compression = btrfs_file_extent_compression(path->nodes[0], ei); + BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); + BUG_ON(compression); + + off = btrfs_file_extent_inline_start(ei); + len = btrfs_file_extent_inline_len(path->nodes[0], ei); + + ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); + if (ret < 0) + goto out; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Helper function to generate a file name that is unique in the root of + * send_root and parent_root. This is used to generate names for orphan inodes. + */ +static int gen_unique_name(struct send_ctx *sctx, + u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *di; + char tmp[64]; + int len; + u64 idx = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + while (1) { + len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", + ino, gen, idx); + if (len >= sizeof(tmp)) { + /* should really not happen */ + ret = -EOVERFLOW; + goto out; + } + + di = btrfs_lookup_dir_item(NULL, sctx->send_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + + if (!sctx->parent_root) { + /* unique */ + ret = 0; + break; + } + + di = btrfs_lookup_dir_item(NULL, sctx->parent_root, + path, BTRFS_FIRST_FREE_OBJECTID, + tmp, strlen(tmp), 0); + btrfs_release_path(path); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (di) { + /* not unique, try again */ + idx++; + continue; + } + /* unique */ + break; + } + + ret = fs_path_add(dest, tmp, strlen(tmp)); + +out: + btrfs_free_path(path); + return ret; +} + +enum inode_state { + inode_state_no_change, + inode_state_will_create, + inode_state_did_create, + inode_state_will_delete, + inode_state_did_delete, +}; + +static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + int left_ret; + int right_ret; + u64 left_gen; + u64 right_gen; + + ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, + NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + left_ret = ret; + + if (!sctx->parent_root) { + right_ret = -ENOENT; + } else { + ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + right_ret = ret; + } + + if (!left_ret && !right_ret) { + if (left_gen == gen && right_gen == gen) + ret = inode_state_no_change; + else if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else if (!left_ret) { + if (left_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_create; + else + ret = inode_state_will_create; + } else { + ret = -ENOENT; + } + } else if (!right_ret) { + if (right_gen == gen) { + if (ino < sctx->send_progress) + ret = inode_state_did_delete; + else + ret = inode_state_will_delete; + } else { + ret = -ENOENT; + } + } else { + ret = -ENOENT; + } + +out: + return ret; +} + +static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret; + + ret = get_cur_inode_state(sctx, ino, gen); + if (ret < 0) + goto out; + + if (ret == inode_state_no_change || + ret == inode_state_did_create || + ret == inode_state_will_delete) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +/* + * Helper function to lookup a dir item in a dir. + */ +static int lookup_dir_item_inode(struct btrfs_root *root, + u64 dir, const char *name, int name_len, + u64 *found_inode, + u8 *found_type) +{ + int ret = 0; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_path *path; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(NULL, root, path, + dir, name, name_len, 0); + if (!di) { + ret = -ENOENT; + goto out; + } + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + *found_inode = key.objectid; + *found_type = btrfs_dir_type(path->nodes[0], di); + +out: + btrfs_free_path(path); + return ret; +} + +static int get_first_ref(struct send_ctx *sctx, + struct btrfs_root *root, u64 ino, + u64 *dir, u64 *dir_gen, struct fs_path *name) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct btrfs_inode_ref *iref; + int len; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (!ret) + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = -ENOENT; + goto out; + } + + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), len); + if (ret < 0) + goto out; + btrfs_release_path(path); + + ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, + NULL); + if (ret < 0) + goto out; + + *dir = found_key.offset; + +out: + btrfs_free_path(path); + return ret; +} + +static int is_first_ref(struct send_ctx *sctx, + struct btrfs_root *root, + u64 ino, u64 dir, + const char *name, int name_len) +{ + int ret; + struct fs_path *tmp_name; + u64 tmp_dir; + u64 tmp_dir_gen; + + tmp_name = fs_path_alloc(sctx); + if (!tmp_name) + return -ENOMEM; + + ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + if (ret < 0) + goto out; + + if (name_len != fs_path_len(tmp_name)) { + ret = 0; + goto out; + } + + ret = memcmp(tmp_name->start, name, name_len); + if (ret) + ret = 0; + else + ret = 1; + +out: + fs_path_free(sctx, tmp_name); + return ret; +} + +static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, + const char *name, int name_len, + u64 *who_ino, u64 *who_gen) +{ + int ret = 0; + u64 other_inode = 0; + u8 other_type = 0; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, + &other_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + + if (other_inode > sctx->send_progress) { + ret = get_inode_info(sctx->parent_root, other_inode, NULL, + who_gen, NULL, NULL, NULL); + if (ret < 0) + goto out; + + ret = 1; + *who_ino = other_inode; + } else { + ret = 0; + } + +out: + return ret; +} + +static int did_overwrite_ref(struct send_ctx *sctx, + u64 dir, u64 dir_gen, + u64 ino, u64 ino_gen, + const char *name, int name_len) +{ + int ret = 0; + u64 gen; + u64 ow_inode; + u8 other_type; + + if (!sctx->parent_root) + goto out; + + ret = is_inode_existent(sctx, dir, dir_gen); + if (ret <= 0) + goto out; + + /* check if the ref was overwritten by another ref */ + ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, + &ow_inode, &other_type); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + /* was never and will never be overwritten */ + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, + NULL); + if (ret < 0) + goto out; + + if (ow_inode == ino && gen == ino_gen) { + ret = 0; + goto out; + } + + /* we know that it is or will be overwritten. check this now */ + if (ow_inode < sctx->send_progress) + ret = 1; + else + ret = 0; + +out: + return ret; +} + +static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 dir; + u64 dir_gen; + + if (!sctx->parent_root) + goto out; + + name = fs_path_alloc(sctx); + if (!name) + return -ENOMEM; + + ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); + if (ret < 0) + goto out; + + ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, + name->start, fs_path_len(name)); + if (ret < 0) + goto out; + +out: + fs_path_free(sctx, name); + return ret; +} + +static int name_cache_insert(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + int ret = 0; + struct name_cache_entry **ncea; + + ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); + if (ncea) { + if (!ncea[0]) + ncea[0] = nce; + else if (!ncea[1]) + ncea[1] = nce; + else + BUG(); + } else { + ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS); + if (!ncea) + return -ENOMEM; + + ncea[0] = nce; + ncea[1] = NULL; + ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); + if (ret < 0) + return ret; + } + list_add_tail(&nce->list, &sctx->name_cache_list); + sctx->name_cache_size++; + + return ret; +} + +static void name_cache_delete(struct send_ctx *sctx, + struct name_cache_entry *nce) +{ + struct name_cache_entry **ncea; + + ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); + BUG_ON(!ncea); + + if (ncea[0] == nce) + ncea[0] = NULL; + else if (ncea[1] == nce) + ncea[1] = NULL; + else + BUG(); + + if (!ncea[0] && !ncea[1]) { + radix_tree_delete(&sctx->name_cache, nce->ino); + kfree(ncea); + } + + list_del(&nce->list); + + sctx->name_cache_size--; +} + +static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, + u64 ino, u64 gen) +{ + struct name_cache_entry **ncea; + + ncea = radix_tree_lookup(&sctx->name_cache, ino); + if (!ncea) + return NULL; + + if (ncea[0] && ncea[0]->gen == gen) + return ncea[0]; + else if (ncea[1] && ncea[1]->gen == gen) + return ncea[1]; + return NULL; +} + +static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) +{ + list_del(&nce->list); + list_add_tail(&nce->list, &sctx->name_cache_list); +} + +static void name_cache_clean_unused(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + + if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE) + return; + + while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); + name_cache_delete(sctx, nce); + kfree(nce); + } +} + +static void name_cache_free(struct send_ctx *sctx) +{ + struct name_cache_entry *nce; + struct name_cache_entry *tmp; + + list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { + name_cache_delete(sctx, nce); + } +} + +static int __get_cur_name_and_parent(struct send_ctx *sctx, + u64 ino, u64 gen, + u64 *parent_ino, + u64 *parent_gen, + struct fs_path *dest) +{ + int ret; + int nce_ret; + struct btrfs_path *path = NULL; + struct name_cache_entry *nce = NULL; + + nce = name_cache_search(sctx, ino, gen); + if (nce) { + if (ino < sctx->send_progress && nce->need_later_update) { + name_cache_delete(sctx, nce); + kfree(nce); + nce = NULL; + } else { + name_cache_used(sctx, nce); + *parent_ino = nce->parent_ino; + *parent_gen = nce->parent_gen; + ret = fs_path_add(dest, nce->name, nce->name_len); + if (ret < 0) + goto out; + ret = nce->ret; + goto out; + } + } + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + ret = is_inode_existent(sctx, ino, gen); + if (ret < 0) + goto out; + + if (!ret) { + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + goto out_cache; + } + + if (ino < sctx->send_progress) + ret = get_first_ref(sctx, sctx->send_root, ino, + parent_ino, parent_gen, dest); + else + ret = get_first_ref(sctx, sctx->parent_root, ino, + parent_ino, parent_gen, dest); + if (ret < 0) + goto out; + + ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, + dest->start, dest->end - dest->start); + if (ret < 0) + goto out; + if (ret) { + fs_path_reset(dest); + ret = gen_unique_name(sctx, ino, gen, dest); + if (ret < 0) + goto out; + ret = 1; + } + +out_cache: + nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); + if (!nce) { + ret = -ENOMEM; + goto out; + } + + nce->ino = ino; + nce->gen = gen; + nce->parent_ino = *parent_ino; + nce->parent_gen = *parent_gen; + nce->name_len = fs_path_len(dest); + nce->ret = ret; + strcpy(nce->name, dest->start); + memset(&nce->use_list, 0, sizeof(nce->use_list)); + + if (ino < sctx->send_progress) + nce->need_later_update = 0; + else + nce->need_later_update = 1; + + nce_ret = name_cache_insert(sctx, nce); + if (nce_ret < 0) + ret = nce_ret; + name_cache_clean_unused(sctx); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Magic happens here. This function returns the first ref to an inode as it + * would look like while receiving the stream at this point in time. + * We walk the path up to the root. For every inode in between, we check if it + * was already processed/sent. If yes, we continue with the parent as found + * in send_root. If not, we continue with the parent as found in parent_root. + * If we encounter an inode that was deleted at this point in time, we use the + * inodes "orphan" name instead of the real name and stop. Same with new inodes + * that were not created yet and overwritten inodes/refs. + * + * When do we have have orphan inodes: + * 1. When an inode is freshly created and thus no valid refs are available yet + * 2. When a directory lost all it's refs (deleted) but still has dir items + * inside which were not processed yet (pending for move/delete). If anyone + * tried to get the path to the dir items, it would get a path inside that + * orphan directory. + * 3. When an inode is moved around or gets new links, it may overwrite the ref + * of an unprocessed inode. If in that case the first ref would be + * overwritten, the overwritten inode gets "orphanized". Later when we + * process this overwritten inode, it is restored at a new place by moving + * the orphan inode. + * + * sctx->send_progress tells this function at which point in time receiving + * would be. + */ +static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *dest) +{ + int ret = 0; + struct fs_path *name = NULL; + u64 parent_inode = 0; + u64 parent_gen = 0; + int stop = 0; + + name = fs_path_alloc(sctx); + if (!name) { + ret = -ENOMEM; + goto out; + } + + dest->reversed = 1; + fs_path_reset(dest); + + while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, &parent_gen, name); + if (ret < 0) + goto out; + if (ret) + stop = 1; + + ret = fs_path_add_path(dest, name); + if (ret < 0) + goto out; + + ino = parent_inode; + gen = parent_gen; + } + +out: + fs_path_free(sctx, name); + if (!ret) + fs_path_unreverse(dest); + return ret; +} + +/* + * Called for regular files when sending extents data. Opens a struct file + * to read from the file. + */ +static int open_cur_inode_file(struct send_ctx *sctx) +{ + int ret = 0; + struct btrfs_key key; + struct vfsmount *mnt; + struct inode *inode; + struct dentry *dentry; + struct file *filp; + int new = 0; + + if (sctx->cur_inode_filp) + goto out; + + key.objectid = sctx->cur_ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root, + &new); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + dentry = d_obtain_alias(inode); + inode = NULL; + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + mnt = mntget(sctx->mnt); + filp = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, current_cred()); + dentry = NULL; + mnt = NULL; + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto out; + } + sctx->cur_inode_filp = filp; + +out: + /* + * no xxxput required here as every vfs op + * does it by itself on failure + */ + return ret; +} + +/* + * Closes the struct file that was created in open_cur_inode_file + */ +static int close_cur_inode_file(struct send_ctx *sctx) +{ + int ret = 0; + + if (!sctx->cur_inode_filp) + goto out; + + ret = filp_close(sctx->cur_inode_filp, NULL); + sctx->cur_inode_filp = NULL; + +out: + return ret; +} + +/* + * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace + */ +static int send_subvol_begin(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_root *parent_root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + char *name = NULL; + int namelen; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS); + if (!name) { + btrfs_free_path(path); + return -ENOMEM; + } + + key.objectid = send_root->objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, + &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type != BTRFS_ROOT_BACKREF_KEY || + key.objectid != send_root->objectid) { + ret = -ENOENT; + goto out; + } + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + namelen = btrfs_root_ref_name_len(leaf, ref); + read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); + btrfs_release_path(path); + + if (ret < 0) + goto out; + + if (parent_root) { + ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); + if (ret < 0) + goto out; + } else { + ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); + if (ret < 0) + goto out; + } + + TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, + sctx->send_root->root_item.ctransid); + if (parent_root) { + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + sctx->parent_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + sctx->parent_root->root_item.ctransid); + } + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + btrfs_free_path(path); + kfree(name); + return ret; +} + +static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + +static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + +static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) +{ + int ret = 0; + struct fs_path *p; + +verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + +static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) +{ + int ret = 0; + struct fs_path *p = NULL; + struct btrfs_inode_item *ii; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + struct btrfs_key key; + int slot; + +verbose_printk("btrfs: send_utimes %llu\n", ino); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + eb = path->nodes[0]; + slot = path->slots[0]; + ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + + ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, + btrfs_inode_atime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, + btrfs_inode_mtime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, + btrfs_inode_ctime(ii)); + /* TODO otime? */ + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + btrfs_free_path(path); + return ret; +} + +/* + * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have + * a valid path yet because we did not process the refs yet. So, the inode + * is created as orphan. + */ +static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + int ret = 0; + struct extent_buffer *eb = path->nodes[0]; + struct btrfs_inode_item *ii; + struct fs_path *p; + int slot = path->slots[0]; + int cmd; + u64 mode; + +verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, ii); + + if (S_ISREG(mode)) + cmd = BTRFS_SEND_C_MKFILE; + else if (S_ISDIR(mode)) + cmd = BTRFS_SEND_C_MKDIR; + else if (S_ISLNK(mode)) + cmd = BTRFS_SEND_C_SYMLINK; + else if (S_ISCHR(mode) || S_ISBLK(mode)) + cmd = BTRFS_SEND_C_MKNOD; + else if (S_ISFIFO(mode)) + cmd = BTRFS_SEND_C_MKFIFO; + else if (S_ISSOCK(mode)) + cmd = BTRFS_SEND_C_MKSOCK; + else { + printk(KERN_WARNING "btrfs: unexpected inode type %o", + (int)(mode & S_IFMT)); + ret = -ENOTSUPP; + goto out; + } + + ret = begin_cmd(sctx, cmd); + if (ret < 0) + goto out; + + ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); + + if (S_ISLNK(mode)) { + fs_path_reset(p); + ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); + } + + ret = send_cmd(sctx); + if (ret < 0) + goto out; + + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + +struct recorded_ref { + struct list_head list; + char *dir_path; + char *name; + struct fs_path *full_path; + u64 dir; + u64 dir_gen; + int dir_path_len; + int name_len; +}; + +/* + * We need to process new refs before deleted refs, but compare_tree gives us + * everything mixed. So we first record all refs and later process them. + * This function is a helper to record one ref. + */ +static int record_ref(struct list_head *head, u64 dir, + u64 dir_gen, struct fs_path *path) +{ + struct recorded_ref *ref; + char *tmp; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->dir = dir; + ref->dir_gen = dir_gen; + ref->full_path = path; + + tmp = strrchr(ref->full_path->start, '/'); + if (!tmp) { + ref->name_len = ref->full_path->end - ref->full_path->start; + ref->name = ref->full_path->start; + ref->dir_path_len = 0; + ref->dir_path = ref->full_path->start; + } else { + tmp++; + ref->name_len = ref->full_path->end - tmp; + ref->name = tmp; + ref->dir_path = ref->full_path->start; + ref->dir_path_len = ref->full_path->end - + ref->full_path->start - 1 - ref->name_len; + } + + list_add_tail(&ref->list, head); + return 0; +} + +static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) +{ + struct recorded_ref *cur; + struct recorded_ref *tmp; + + list_for_each_entry_safe(cur, tmp, head, list) { + fs_path_free(sctx, cur->full_path); + kfree(cur); + } + INIT_LIST_HEAD(head); +} + +static void free_recorded_refs(struct send_ctx *sctx) +{ + __free_recorded_refs(sctx, &sctx->new_refs); + __free_recorded_refs(sctx, &sctx->deleted_refs); +} + +/* + * Renames/moves a file/dir to it's orphan name. Used when the first + * ref of an unprocessed inode gets overwritten and for all non empty + * directories. + */ +static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, + struct fs_path *path) +{ + int ret; + struct fs_path *orphan; + + orphan = fs_path_alloc(sctx); + if (!orphan) + return -ENOMEM; + + ret = gen_unique_name(sctx, ino, gen, orphan); + if (ret < 0) + goto out; + + ret = send_rename(sctx, path, orphan); + +out: + fs_path_free(sctx, orphan); + return ret; +} + +/* + * Returns 1 if a directory can be removed at this point in time. + * We check this by iterating all dir items and checking if the inode behind + * the dir item was already processed. + */ +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +{ + int ret = 0; + struct btrfs_root *root = sctx->parent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key loc; + struct btrfs_dir_item *di; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (!ret) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + break; + } + + di = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + + if (loc.objectid > send_progress) { + ret = 0; + goto out; + } + + btrfs_release_path(path); + key.offset = found_key.offset + 1; + } + + ret = 1; + +out: + btrfs_free_path(path); + return ret; +} + +struct finish_unordered_dir_ctx { + struct send_ctx *sctx; + struct fs_path *cur_path; + struct fs_path *dir_path; + u64 dir_ino; + int need_delete; + int delete_pass; +}; + +int __finish_unordered_dir(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret = 0; + struct finish_unordered_dir_ctx *fctx = ctx; + struct send_ctx *sctx = fctx->sctx; + u64 di_gen; + u64 di_mode; + int is_orphan = 0; + + if (di_key->objectid >= fctx->dir_ino) + goto out; + + fs_path_reset(fctx->cur_path); + + ret = get_inode_info(sctx->send_root, di_key->objectid, + NULL, &di_gen, &di_mode, NULL, NULL); + if (ret < 0) + goto out; + + ret = is_first_ref(sctx, sctx->send_root, di_key->objectid, + fctx->dir_ino, name, name_len); + if (ret < 0) + goto out; + if (ret) { + is_orphan = 1; + ret = gen_unique_name(sctx, di_key->objectid, di_gen, + fctx->cur_path); + } else { + ret = get_cur_path(sctx, di_key->objectid, di_gen, + fctx->cur_path); + } + if (ret < 0) + goto out; + + ret = fs_path_add(fctx->dir_path, name, name_len); + if (ret < 0) + goto out; + + if (!fctx->delete_pass) { + if (S_ISDIR(di_mode)) { + ret = send_rename(sctx, fctx->cur_path, + fctx->dir_path); + } else { + ret = send_link(sctx, fctx->dir_path, + fctx->cur_path); + if (is_orphan) + fctx->need_delete = 1; + } + } else if (!S_ISDIR(di_mode)) { + ret = send_unlink(sctx, fctx->cur_path); + } else { + ret = 0; + } + + fs_path_remove(fctx->dir_path); + +out: + return ret; +} + +/* + * Go through all dir items and see if we find refs which could not be created + * in the past because the dir did not exist at that time. + */ +static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen) +{ + int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct finish_unordered_dir_ctx fctx; + int slot; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + memset(&fctx, 0, sizeof(fctx)); + fctx.sctx = sctx; + fctx.cur_path = fs_path_alloc(sctx); + fctx.dir_path = fs_path_alloc(sctx); + if (!fctx.cur_path || !fctx.dir_path) { + ret = -ENOMEM; + goto out; + } + fctx.dir_ino = dir; + + ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path); + if (ret < 0) + goto out; + + /* + * We do two passes. The first links in the new refs and the second + * deletes orphans if required. Deletion of orphans is not required for + * directory inodes, as we always have only one ref and use rename + * instead of link for those. + */ + +again: + key.objectid = dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, + 1, 0); + if (ret < 0) + goto out; + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + btrfs_release_path(path); + break; + } + + ret = iterate_dir_item(sctx, sctx->send_root, path, + &found_key, __finish_unordered_dir, + &fctx); + if (ret < 0) + goto out; + + key.offset = found_key.offset + 1; + btrfs_release_path(path); + } + + if (!fctx.delete_pass && fctx.need_delete) { + fctx.delete_pass = 1; + goto again; + } + +out: + btrfs_free_path(path); + fs_path_free(sctx, fctx.cur_path); + fs_path_free(sctx, fctx.dir_path); + return ret; +} + +/* + * This does all the move/link/unlink/rmdir magic. + */ +static int process_recorded_refs(struct send_ctx *sctx) +{ + int ret = 0; + struct recorded_ref *cur; + struct ulist *check_dirs = NULL; + struct ulist_iterator uit; + struct ulist_node *un; + struct fs_path *valid_path = NULL; + u64 ow_inode; + u64 ow_gen; + int did_overwrite = 0; + int is_orphan = 0; + +verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); + + valid_path = fs_path_alloc(sctx); + if (!valid_path) { + ret = -ENOMEM; + goto out; + } + + check_dirs = ulist_alloc(GFP_NOFS); + if (!check_dirs) { + ret = -ENOMEM; + goto out; + } + + /* + * First, check if the first ref of the current inode was overwritten + * before. If yes, we know that the current inode was already orphanized + * and thus use the orphan name. If not, we can use get_cur_path to + * get the path of the first ref as it would like while receiving at + * this point in time. + * New inodes are always orphan at the beginning, so force to use the + * orphan name in this case. + * The first ref is stored in valid_path and will be updated if it + * gets moved around. + */ + if (!sctx->cur_inode_new) { + ret = did_overwrite_first_ref(sctx, sctx->cur_ino, + sctx->cur_inode_gen); + if (ret < 0) + goto out; + if (ret) + did_overwrite = 1; + } + if (sctx->cur_inode_new || did_overwrite) { + ret = gen_unique_name(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } else { + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + valid_path); + if (ret < 0) + goto out; + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * Check if this new ref would overwrite the first ref of + * another unprocessed inode. If yes, orphanize the + * overwritten inode. If we find an overwritten ref that is + * not the first ref, simply unlink it. + */ + ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, + cur->name, cur->name_len, + &ow_inode, &ow_gen); + if (ret < 0) + goto out; + if (ret) { + ret = is_first_ref(sctx, sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); + if (ret < 0) + goto out; + if (ret) { + ret = orphanize_inode(sctx, ow_inode, ow_gen, + cur->full_path); + if (ret < 0) + goto out; + } else { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + + /* + * link/move the ref to the new place. If we have an orphan + * inode, move it and update valid_path. If not, link or move + * it depending on the inode mode. + */ + if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + ret = send_rename(sctx, valid_path, cur->full_path); + if (ret < 0) + goto out; + is_orphan = 0; + ret = fs_path_copy(valid_path, cur->full_path); + if (ret < 0) + goto out; + } else { + if (S_ISDIR(sctx->cur_inode_mode)) { + /* + * Dirs can't be linked, so move it. For moved + * dirs, we always have one new and one deleted + * ref. The deleted ref is ignored later. + */ + ret = send_rename(sctx, valid_path, + cur->full_path); + if (ret < 0) + goto out; + ret = fs_path_copy(valid_path, cur->full_path); + if (ret < 0) + goto out; + } else { + ret = send_link(sctx, cur->full_path, + valid_path); + if (ret < 0) + goto out; + } + } + ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, + GFP_NOFS); + if (ret < 0) + goto out; + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { + /* + * Check if we can already rmdir the directory. If not, + * orphanize it. For every dir item inside that gets deleted + * later, we do this check again and rmdir it then if possible. + * See the use of check_dirs for more details. + */ + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + } else if (!is_orphan) { + ret = orphanize_inode(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + } + + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, + GFP_NOFS); + if (ret < 0) + goto out; + } + } else if (!S_ISDIR(sctx->cur_inode_mode)) { + /* + * We have a non dir inode. Go through all deleted refs and + * unlink them if they were not already overwritten by other + * inodes. + */ + list_for_each_entry(cur, &sctx->deleted_refs, list) { + ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino, sctx->cur_inode_gen, + cur->name, cur->name_len); + if (ret < 0) + goto out; + if (!ret) { + /* + * In case the inode was moved to a directory + * that was not created yet (see + * __record_new_ref), we can not unlink the ref + * as it will be needed later when the parent + * directory is created, so that we can move in + * the inode to the new dir. + */ + if (!is_orphan && + sctx->cur_inode_first_ref_orphan) { + ret = orphanize_inode(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + cur->full_path); + if (ret < 0) + goto out; + ret = gen_unique_name(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + valid_path); + if (ret < 0) + goto out; + is_orphan = 1; + + } else { + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, + GFP_NOFS); + if (ret < 0) + goto out; + } + + /* + * If the inode is still orphan, unlink the orphan. This may + * happen when a previous inode did overwrite the first ref + * of this inode and no new refs were added for the current + * inode. + * We can however not delete the orphan in case the inode relies + * in a directory that was not created yet (see + * __record_new_ref) + */ + if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + ret = send_unlink(sctx, valid_path); + if (ret < 0) + goto out; + } + } + + /* + * We did collect all parent dirs where cur_inode was once located. We + * now go through all these dirs and check if they are pending for + * deletion and if it's finally possible to perform the rmdir now. + * We also update the inode stats of the parent dirs here. + */ + ULIST_ITER_INIT(&uit); + while ((un = ulist_next(check_dirs, &uit))) { + if (un->val > sctx->cur_ino) + continue; + + ret = get_cur_inode_state(sctx, un->val, un->aux); + if (ret < 0) + goto out; + + if (ret == inode_state_did_create || + ret == inode_state_no_change) { + /* TODO delayed utimes */ + ret = send_utimes(sctx, un->val, un->aux); + if (ret < 0) + goto out; + } else if (ret == inode_state_did_delete) { + ret = can_rmdir(sctx, un->val, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = get_cur_path(sctx, un->val, un->aux, + valid_path); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, valid_path); + if (ret < 0) + goto out; + } + } + } + + /* + * Current inode is now at it's new position, so we must increase + * send_progress + */ + sctx->send_progress = sctx->cur_ino + 1; + + /* + * We may have a directory here that has pending refs which could not + * be created before (because the dir did not exist before, see + * __record_new_ref). finish_outoforder_dir will link/move the pending + * refs. + */ + if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) { + ret = finish_outoforder_dir(sctx, sctx->cur_ino, + sctx->cur_inode_gen); + if (ret < 0) + goto out; + } + + ret = 0; + +out: + free_recorded_refs(sctx); + ulist_free(check_dirs); + fs_path_free(sctx, valid_path); + return ret; +} + +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct fs_path *p; + u64 gen; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + NULL); + if (ret < 0) + goto out; + + /* + * The parent may be non-existent at this point in time. This happens + * if the ino of the parent dir is higher then the current ino. In this + * case, we can not process this ref until the parent dir is finally + * created. If we reach the parent dir later, process_recorded_refs + * will go through all dir items and process the refs that could not be + * processed before. In case this is the first ref, we set + * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to + * keep an orphan of the inode so that it later can be used for + * link/move + */ + ret = is_inode_existent(sctx, dir, gen); + if (ret < 0) + goto out; + if (!ret) { + ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir, + name->start, fs_path_len(name)); + if (ret < 0) + goto out; + if (ret) + sctx->cur_inode_first_ref_orphan = 1; + ret = 0; + goto out; + } + + ret = get_cur_path(sctx, dir, gen, p); + if (ret < 0) + goto out; + ret = fs_path_add_path(p, name); + if (ret < 0) + goto out; + + ret = record_ref(&sctx->new_refs, dir, gen, p); + +out: + if (ret) + fs_path_free(sctx, p); + return ret; +} + +static int __record_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct fs_path *p; + u64 gen; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, + NULL); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, dir, gen, p); + if (ret < 0) + goto out; + ret = fs_path_add_path(p, name); + if (ret < 0) + goto out; + + ret = record_ref(&sctx->deleted_refs, dir, gen, p); + +out: + if (ret) + fs_path_free(sctx, p); + return ret; +} + +static int record_new_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +static int record_deleted_ref(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +struct find_ref_ctx { + u64 dir; + struct fs_path *name; + int found_idx; +}; + +static int __find_iref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx_) +{ + struct find_ref_ctx *ctx = ctx_; + + if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && + strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + ctx->found_idx = num; + return 1; + } + return 0; +} + +static int find_iref(struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 dir, struct fs_path *name) +{ + int ret; + struct find_ref_ctx ctx; + + ctx.dir = dir; + ctx.name = name; + ctx.found_idx = -1; + + ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + + return ctx.found_idx; +} + +static int __record_changed_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_iref(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, dir, name); + if (ret == -ENOENT) + ret = __record_new_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int __record_changed_deleted_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, + dir, name); + if (ret == -ENOENT) + ret = __record_deleted_ref(num, dir, index, name, sctx); + else if (ret > 0) + ret = 0; + + return ret; +} + +static int record_changed_ref(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_changed_new_ref, sctx); + if (ret < 0) + goto out; + ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + if (ret < 0) + goto out; + ret = 0; + +out: + return ret; +} + +/* + * Record and process all refs at once. Needed when an inode changes the + * generation number, which means that it was deleted and recreated. + */ +static int process_all_refs(struct send_ctx *sctx, + enum btrfs_compare_tree_result cmd) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + iterate_inode_ref_t cb; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + if (cmd == BTRFS_COMPARE_TREE_NEW) { + root = sctx->send_root; + cb = __record_new_ref; + } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { + root = sctx->parent_root; + cb = __record_deleted_ref; + } else { + BUG(); + } + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) { + btrfs_release_path(path); + goto out; + } + if (ret) { + btrfs_release_path(path); + break; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + btrfs_release_path(path); + break; + } + + ret = iterate_inode_ref(sctx, sctx->parent_root, path, + &found_key, 0, cb, sctx); + btrfs_release_path(path); + if (ret < 0) + goto out; + + key.offset = found_key.offset + 1; + } + + ret = process_recorded_refs(sctx); + +out: + btrfs_free_path(path); + return ret; +} + +static int send_set_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len, + const char *data, int data_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int send_remove_xattr(struct send_ctx *sctx, + struct fs_path *path, + const char *name, int name_len) +{ + int ret = 0; + + ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int __process_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + posix_acl_xattr_header dummy_acl; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + /* + * This hack is needed because empty acl's are stored as zero byte + * data in xattrs. Problem with that is, that receiving these zero byte + * acl's will fail later. To fix this, we send a dummy acl list that + * only contains the version number and no entries. + */ + if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || + !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { + if (data_len == 0) { + dummy_acl.a_version = + cpu_to_le32(POSIX_ACL_XATTR_VERSION); + data = (char *)&dummy_acl; + data_len = sizeof(dummy_acl); + } + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_set_xattr(sctx, p, name, name_len, data, data_len); + +out: + fs_path_free(sctx, p); + return ret; +} + +static int __process_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + struct fs_path *p; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + ret = send_remove_xattr(sctx, p, name, name_len); + +out: + fs_path_free(sctx, p); + return ret; +} + +static int process_new_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); + + return ret; +} + +static int process_deleted_xattr(struct send_ctx *sctx) +{ + int ret; + + ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); + + return ret; +} + +struct find_xattr_ctx { + const char *name; + int name_len; + int found_idx; + char *found_data; + int found_data_len; +}; + +static int __find_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *vctx) +{ + struct find_xattr_ctx *ctx = vctx; + + if (name_len == ctx->name_len && + strncmp(name, ctx->name, name_len) == 0) { + ctx->found_idx = num; + ctx->found_data_len = data_len; + ctx->found_data = kmalloc(data_len, GFP_NOFS); + if (!ctx->found_data) + return -ENOMEM; + memcpy(ctx->found_data, data, data_len); + return 1; + } + return 0; +} + +static int find_xattr(struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + const char *name, int name_len, + char **data, int *data_len) +{ + int ret; + struct find_xattr_ctx ctx; + + ctx.name = name; + ctx.name_len = name_len; + ctx.found_idx = -1; + ctx.found_data = NULL; + ctx.found_data_len = 0; + + ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); + if (ret < 0) + return ret; + + if (ctx.found_idx == -1) + return -ENOENT; + if (data) { + *data = ctx.found_data; + *data_len = ctx.found_data_len; + } else { + kfree(ctx.found_data); + } + return ctx.found_idx; +} + + +static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + char *found_data = NULL; + int found_data_len = 0; + struct fs_path *p = NULL; + + ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); + if (ret == -ENOENT) { + ret = __process_new_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + } else if (ret >= 0) { + if (data_len != found_data_len || + memcmp(data, found_data, data_len)) { + ret = __process_new_xattr(num, di_key, name, name_len, + data, data_len, type, ctx); + } else { + ret = 0; + } + } + + kfree(found_data); + fs_path_free(sctx, p); + return ret; +} + +static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, + const char *name, int name_len, + const char *data, int data_len, + u8 type, void *ctx) +{ + int ret; + struct send_ctx *sctx = ctx; + + ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); + if (ret == -ENOENT) + ret = __process_deleted_xattr(num, di_key, name, name_len, data, + data_len, type, ctx); + else if (ret >= 0) + ret = 0; + + return ret; +} + +static int process_changed_xattr(struct send_ctx *sctx) +{ + int ret = 0; + + ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_changed_new_xattr, sctx); + if (ret < 0) + goto out; + ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_changed_deleted_xattr, sctx); + +out: + return ret; +} + +static int process_all_new_xattrs(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + root = sctx->send_root; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = iterate_dir_item(sctx, root, path, &found_key, + __process_new_xattr, sctx); + if (ret < 0) + goto out; + + btrfs_release_path(path); + key.offset = found_key.offset + 1; + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Read some bytes from the current inode/file and send a write command to + * user space. + */ +static int send_write(struct send_ctx *sctx, u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + loff_t pos = offset; + int readed; + mm_segment_t old_fs; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + /* + * vfs normally only accepts user space buffers for security reasons. + * we only read from the file and also only provide the read_buf buffer + * to vfs. As this buffer does not come from a user space call, it's + * ok to temporary allow kernel space buffers. + */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + +verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); + + ret = open_cur_inode_file(sctx); + if (ret < 0) + goto out; + + ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); + if (ret < 0) + goto out; + readed = ret; + if (!readed) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + set_fs(old_fs); + if (ret < 0) + return ret; + return readed; +} + +/* + * Send a clone command to user space. + */ +static int send_clone(struct send_ctx *sctx, + u64 offset, u32 len, + struct clone_root *clone_root) +{ + int ret = 0; + struct btrfs_root *clone_root2 = clone_root->root; + struct fs_path *p; + u64 gen; + +verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " + "clone_inode=%llu, clone_offset=%llu\n", offset, len, + clone_root->root->objectid, clone_root->ino, + clone_root->offset); + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + + if (clone_root2 == sctx->send_root) { + ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, + &gen, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, clone_root->ino, gen, p); + } else { + ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); + } + if (ret < 0) + goto out; + + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root2->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, + clone_root2->root_item.ctransid); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, + clone_root->offset); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + +static int send_write_or_clone(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key, + struct clone_root *clone_root) +{ + int ret = 0; + struct btrfs_file_extent_item *ei; + u64 offset = key->offset; + u64 pos = 0; + u64 len; + u32 l; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_INLINE) + len = btrfs_file_extent_inline_len(path->nodes[0], ei); + else + len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + + if (offset + len > sctx->cur_inode_size) + len = sctx->cur_inode_size - offset; + if (len == 0) { + ret = 0; + goto out; + } + + if (!clone_root) { + while (pos < len) { + l = len - pos; + if (l > BTRFS_SEND_READ_SIZE) + l = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, pos + offset, l); + if (ret < 0) + goto out; + if (!ret) + break; + pos += ret; + } + ret = 0; + } else { + ret = send_clone(sctx, offset, len, clone_root); + } + +out: + return ret; +} + +static int is_extent_unchanged(struct send_ctx *sctx, + struct btrfs_path *left_path, + struct btrfs_key *ekey) +{ + int ret = 0; + struct btrfs_key key; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + struct btrfs_key found_key; + struct btrfs_file_extent_item *ei; + u64 left_disknr; + u64 right_disknr; + u64 left_offset; + u64 right_offset; + u64 left_offset_fixed; + u64 left_len; + u64 right_len; + u8 left_type; + u8 right_type; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + eb = left_path->nodes[0]; + slot = left_path->slots[0]; + + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + left_type = btrfs_file_extent_type(eb, ei); + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + + if (left_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + + /* + * Following comments will refer to these graphics. L is the left + * extents which we are checking at the moment. 1-8 are the right + * extents that we iterate. + * + * |-----L-----| + * |-1-|-2a-|-3-|-4-|-5-|-6-| + * + * |-----L-----| + * |--1--|-2b-|...(same as above) + * + * Alternative situation. Happens on files where extents got split. + * |-----L-----| + * |-----------7-----------|-6-| + * + * Alternative situation. Happens on files which got larger. + * |-----L-----| + * |-8-| + * Nothing follows after 8. + */ + + key.objectid = ekey->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ekey->offset; + ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + /* + * Handle special case where the right side has no extents at all. + */ + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + /* + * We're now on 2a, 2b or 7. + */ + key = found_key; + while (key.offset < ekey->offset + left_len) { + ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + right_type = btrfs_file_extent_type(eb, ei); + right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + right_len = btrfs_file_extent_num_bytes(eb, ei); + right_offset = btrfs_file_extent_offset(eb, ei); + + if (right_type != BTRFS_FILE_EXTENT_REG) { + ret = 0; + goto out; + } + + /* + * Are we at extent 8? If yes, we know the extent is changed. + * This may only happen on the first iteration. + */ + if (found_key.offset + right_len < ekey->offset) { + ret = 0; + goto out; + } + + left_offset_fixed = left_offset; + if (key.offset < ekey->offset) { + /* Fix the right offset for 2a and 7. */ + right_offset += ekey->offset - key.offset; + } else { + /* Fix the left offset for all behind 2a and 2b */ + left_offset_fixed += key.offset - ekey->offset; + } + + /* + * Check if we have the same extent. + */ + if (left_disknr + left_offset_fixed != + right_disknr + right_offset) { + ret = 0; + goto out; + } + + /* + * Go to the next extent. + */ + ret = btrfs_next_item(sctx->parent_root, path); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + key.offset += right_len; + break; + } else { + if (found_key.offset != key.offset + right_len) { + /* Should really not happen */ + ret = -EIO; + goto out; + } + } + key = found_key; + } + + /* + * We're now behind the left extent (treat as unchanged) or at the end + * of the right side (treat as changed). + */ + if (key.offset >= ekey->offset + left_len) + ret = 1; + else + ret = 0; + + +out: + btrfs_free_path(path); + return ret; +} + +static int process_extent(struct send_ctx *sctx, + struct btrfs_path *path, + struct btrfs_key *key) +{ + int ret = 0; + struct clone_root *found_clone = NULL; + + if (S_ISLNK(sctx->cur_inode_mode)) + return 0; + + if (sctx->parent_root && !sctx->cur_inode_new) { + ret = is_extent_unchanged(sctx, path, key); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + } + + ret = find_extent_clone(sctx, path, key->objectid, key->offset, + sctx->cur_inode_size, &found_clone); + if (ret != -ENOENT && ret < 0) + goto out; + + ret = send_write_or_clone(sctx, path, key, found_clone); + +out: + return ret; +} + +static int process_all_extents(struct send_ctx *sctx) +{ + int ret; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *eb; + int slot; + + root = sctx->send_root; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = sctx->cmp_key->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + if (found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + ret = process_extent(sctx, path, &found_key); + if (ret < 0) + goto out; + + btrfs_release_path(path); + key.offset = found_key.offset + 1; + } + +out: + btrfs_free_path(path); + return ret; +} + +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +{ + int ret = 0; + + if (sctx->cur_ino == 0) + goto out; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && + sctx->cmp_key->type <= BTRFS_INODE_REF_KEY) + goto out; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) + goto out; + + ret = process_recorded_refs(sctx); + +out: + return ret; +} + +static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) +{ + int ret = 0; + u64 left_mode; + u64 left_uid; + u64 left_gid; + u64 right_mode; + u64 right_uid; + u64 right_gid; + int need_chmod = 0; + int need_chown = 0; + + ret = process_recorded_refs_if_needed(sctx, at_end); + if (ret < 0) + goto out; + + if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) + goto out; + if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) + goto out; + + ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, + &left_mode, &left_uid, &left_gid); + if (ret < 0) + goto out; + + if (!S_ISLNK(sctx->cur_inode_mode)) { + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chmod = 1; + need_chown = 1; + } else { + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, + NULL, NULL, &right_mode, &right_uid, + &right_gid); + if (ret < 0) + goto out; + + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (left_mode != right_mode) + need_chmod = 1; + } + } + + if (S_ISREG(sctx->cur_inode_mode)) { + ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } + + if (need_chown) { + ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_uid, left_gid); + if (ret < 0) + goto out; + } + if (need_chmod) { + ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_mode); + if (ret < 0) + goto out; + } + + /* + * Need to send that every time, no matter if it actually changed + * between the two trees as we have done changes to the inode before. + */ + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; + +out: + return ret; +} + +static int changed_inode(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + struct btrfs_key *key = sctx->cmp_key; + struct btrfs_inode_item *left_ii = NULL; + struct btrfs_inode_item *right_ii = NULL; + u64 left_gen = 0; + u64 right_gen = 0; + + ret = close_cur_inode_file(sctx); + if (ret < 0) + goto out; + + sctx->cur_ino = key->objectid; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_first_ref_orphan = 0; + sctx->send_progress = sctx->cur_ino; + + if (result == BTRFS_COMPARE_TREE_NEW || + result == BTRFS_COMPARE_TREE_CHANGED) { + left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], + sctx->left_path->slots[0], + struct btrfs_inode_item); + left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], + left_ii); + } else { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + } + if (result == BTRFS_COMPARE_TREE_CHANGED) { + right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], + sctx->right_path->slots[0], + struct btrfs_inode_item); + + right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], + right_ii); + if (left_gen != right_gen) + sctx->cur_inode_new_gen = 1; + } + + if (result == BTRFS_COMPARE_TREE_NEW) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) + ret = send_create_inode(sctx, sctx->left_path, + sctx->cmp_key); + } else if (result == BTRFS_COMPARE_TREE_DELETED) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + if (sctx->cur_inode_new_gen) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_deleted = 1; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 1; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + ret = send_create_inode(sctx, sctx->left_path, + sctx->cmp_key); + if (ret < 0) + goto out; + + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } else { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = 0; + sctx->cur_inode_new_gen = 0; + sctx->cur_inode_deleted = 0; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], left_ii); + } + } + +out: + return ret; +} + +static int changed_ref(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = record_new_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = record_deleted_ref(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = record_changed_ref(sctx); + } + + return ret; +} + +static int changed_xattr(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + ret = process_new_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_DELETED) + ret = process_deleted_xattr(sctx); + else if (result == BTRFS_COMPARE_TREE_CHANGED) + ret = process_changed_xattr(sctx); + } + + return ret; +} + +static int changed_extent(struct send_ctx *sctx, + enum btrfs_compare_tree_result result) +{ + int ret = 0; + + BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result != BTRFS_COMPARE_TREE_DELETED) + ret = process_extent(sctx, sctx->left_path, + sctx->cmp_key); + } + + return ret; +} + + +static int changed_cb(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + + sctx->left_path = left_path; + sctx->right_path = right_path; + sctx->cmp_key = key; + + ret = finish_inode_if_needed(sctx, 0); + if (ret < 0) + goto out; + + if (key->type == BTRFS_INODE_ITEM_KEY) + ret = changed_inode(sctx, result); + else if (key->type == BTRFS_INODE_REF_KEY) + ret = changed_ref(sctx, result); + else if (key->type == BTRFS_XATTR_ITEM_KEY) + ret = changed_xattr(sctx, result); + else if (key->type == BTRFS_EXTENT_DATA_KEY) + ret = changed_extent(sctx, result); + +out: + return ret; +} + +static int full_send_tree(struct send_ctx *sctx) +{ + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *send_root = sctx->send_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *eb; + int slot; + u64 start_ctransid; + u64 ctransid; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + spin_lock(&send_root->root_times_lock); + start_ctransid = btrfs_root_ctransid(&send_root->root_item); + spin_unlock(&send_root->root_times_lock); + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + +join_trans: + /* + * We need to make sure the transaction does not get committed + * while we do anything on commit roots. Join a transaction to prevent + * this. + */ + trans = btrfs_join_transaction(send_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + /* + * Make sure the tree has not changed + */ + spin_lock(&send_root->root_times_lock); + ctransid = btrfs_root_ctransid(&send_root->root_item); + spin_unlock(&send_root->root_times_lock); + + if (ctransid != start_ctransid) { + WARN(1, KERN_WARNING "btrfs: the root that you're trying to " + "send was modified in between. This is " + "probably a bug.\n"); + ret = -EIO; + goto out; + } + + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); + if (ret < 0) + goto out; + if (ret) + goto out_finish; + + while (1) { + /* + * When someone want to commit while we iterate, end the + * joined transaction and rejoin. + */ + if (btrfs_should_end_transaction(trans, send_root)) { + ret = btrfs_end_transaction(trans, send_root); + trans = NULL; + if (ret < 0) + goto out; + btrfs_release_path(path); + goto join_trans; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + + ret = changed_cb(send_root, NULL, path, NULL, + &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + if (ret < 0) + goto out; + + key.objectid = found_key.objectid; + key.type = found_key.type; + key.offset = found_key.offset + 1; + + ret = btrfs_next_item(send_root, path); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + break; + } + } + +out_finish: + ret = finish_inode_if_needed(sctx, 1); + +out: + btrfs_free_path(path); + if (trans) { + if (!ret) + ret = btrfs_end_transaction(trans, send_root); + else + btrfs_end_transaction(trans, send_root); + } + return ret; +} + +static int send_subvol(struct send_ctx *sctx) +{ + int ret; + + ret = send_header(sctx); + if (ret < 0) + goto out; + + ret = send_subvol_begin(sctx); + if (ret < 0) + goto out; + + if (sctx->parent_root) { + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, + changed_cb, sctx); + if (ret < 0) + goto out; + ret = finish_inode_if_needed(sctx, 1); + if (ret < 0) + goto out; + } else { + ret = full_send_tree(sctx); + if (ret < 0) + goto out; + } + +out: + if (!ret) + ret = close_cur_inode_file(sctx); + else + close_cur_inode_file(sctx); + + free_recorded_refs(sctx); + return ret; +} + +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +{ + int ret = 0; + struct btrfs_root *send_root; + struct btrfs_root *clone_root; + struct btrfs_fs_info *fs_info; + struct btrfs_ioctl_send_args *arg = NULL; + struct btrfs_key key; + struct file *filp = NULL; + struct send_ctx *sctx = NULL; + u32 i; + u64 *clone_sources_tmp = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root; + fs_info = send_root->fs_info; + + arg = memdup_user(arg_, sizeof(*arg)); + if (IS_ERR(arg)) { + ret = PTR_ERR(arg); + arg = NULL; + goto out; + } + + if (!access_ok(VERIFY_READ, arg->clone_sources, + sizeof(*arg->clone_sources * + arg->clone_sources_count))) { + ret = -EFAULT; + goto out; + } + + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); + if (!sctx) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&sctx->new_refs); + INIT_LIST_HEAD(&sctx->deleted_refs); + INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); + INIT_LIST_HEAD(&sctx->name_cache_list); + + sctx->send_filp = fget(arg->send_fd); + if (IS_ERR(sctx->send_filp)) { + ret = PTR_ERR(sctx->send_filp); + goto out; + } + + sctx->mnt = mnt_file->f_path.mnt; + + sctx->send_root = send_root; + sctx->clone_roots_cnt = arg->clone_sources_count; + + sctx->send_max_size = BTRFS_SEND_BUF_SIZE; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } + + sctx->clone_roots = vzalloc(sizeof(struct clone_root) * + (arg->clone_sources_count + 1)); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } + + if (arg->clone_sources_count) { + clone_sources_tmp = vmalloc(arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(clone_sources_tmp, arg->clone_sources, + arg->clone_sources_count * + sizeof(*arg->clone_sources)); + if (ret) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < arg->clone_sources_count; i++) { + key.objectid = clone_sources_tmp[i]; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (!clone_root) { + ret = -EINVAL; + goto out; + } + if (IS_ERR(clone_root)) { + ret = PTR_ERR(clone_root); + goto out; + } + sctx->clone_roots[i].root = clone_root; + } + vfree(clone_sources_tmp); + clone_sources_tmp = NULL; + } + + if (arg->parent_root) { + key.objectid = arg->parent_root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (!sctx->parent_root) { + ret = -EINVAL; + goto out; + } + } + + /* + * Clones from send_root are allowed, but only if the clone source + * is behind the current send position. This is checked while searching + * for possible clone sources. + */ + sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + + /* We do a bsearch later */ + sort(sctx->clone_roots, sctx->clone_roots_cnt, + sizeof(*sctx->clone_roots), __clone_root_cmp_sort, + NULL); + + ret = send_subvol(sctx); + if (ret < 0) + goto out; + + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + +out: + if (filp) + fput(filp); + kfree(arg); + vfree(clone_sources_tmp); + + if (sctx) { + if (sctx->send_filp) + fput(sctx->send_filp); + + vfree(sctx->clone_roots); + vfree(sctx->send_buf); + vfree(sctx->read_buf); + + name_cache_free(sctx); + + kfree(sctx); + } + + return ret; +} diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h new file mode 100644 index 00000000000..9934e948e57 --- /dev/null +++ b/fs/btrfs/send.h @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2012 Alexander Block. All rights reserved. + * Copyright (C) 2012 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" + +#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +#define BTRFS_SEND_STREAM_VERSION 1 + +#define BTRFS_SEND_BUF_SIZE (1024 * 64) +#define BTRFS_SEND_READ_SIZE (1024 * 48) + +enum btrfs_tlv_type { + BTRFS_TLV_U8, + BTRFS_TLV_U16, + BTRFS_TLV_U32, + BTRFS_TLV_U64, + BTRFS_TLV_BINARY, + BTRFS_TLV_STRING, + BTRFS_TLV_UUID, + BTRFS_TLV_TIMESPEC, +}; + +struct btrfs_stream_header { + char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)]; + __le32 version; +} __attribute__ ((__packed__)); + +struct btrfs_cmd_header { + /* len excluding the header */ + __le32 len; + __le16 cmd; + /* crc including the header with zero crc field */ + __le32 crc; +} __attribute__ ((__packed__)); + +struct btrfs_tlv_header { + __le16 tlv_type; + /* len excluding the header */ + __le16 tlv_len; +} __attribute__ ((__packed__)); + +/* commands */ +enum btrfs_send_cmd { + BTRFS_SEND_C_UNSPEC, + + BTRFS_SEND_C_SUBVOL, + BTRFS_SEND_C_SNAPSHOT, + + BTRFS_SEND_C_MKFILE, + BTRFS_SEND_C_MKDIR, + BTRFS_SEND_C_MKNOD, + BTRFS_SEND_C_MKFIFO, + BTRFS_SEND_C_MKSOCK, + BTRFS_SEND_C_SYMLINK, + + BTRFS_SEND_C_RENAME, + BTRFS_SEND_C_LINK, + BTRFS_SEND_C_UNLINK, + BTRFS_SEND_C_RMDIR, + + BTRFS_SEND_C_SET_XATTR, + BTRFS_SEND_C_REMOVE_XATTR, + + BTRFS_SEND_C_WRITE, + BTRFS_SEND_C_CLONE, + + BTRFS_SEND_C_TRUNCATE, + BTRFS_SEND_C_CHMOD, + BTRFS_SEND_C_CHOWN, + BTRFS_SEND_C_UTIMES, + + BTRFS_SEND_C_END, + __BTRFS_SEND_C_MAX, +}; +#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) + +/* attributes in send stream */ +enum { + BTRFS_SEND_A_UNSPEC, + + BTRFS_SEND_A_UUID, + BTRFS_SEND_A_CTRANSID, + + BTRFS_SEND_A_INO, + BTRFS_SEND_A_SIZE, + BTRFS_SEND_A_MODE, + BTRFS_SEND_A_UID, + BTRFS_SEND_A_GID, + BTRFS_SEND_A_RDEV, + BTRFS_SEND_A_CTIME, + BTRFS_SEND_A_MTIME, + BTRFS_SEND_A_ATIME, + BTRFS_SEND_A_OTIME, + + BTRFS_SEND_A_XATTR_NAME, + BTRFS_SEND_A_XATTR_DATA, + + BTRFS_SEND_A_PATH, + BTRFS_SEND_A_PATH_TO, + BTRFS_SEND_A_PATH_LINK, + + BTRFS_SEND_A_FILE_OFFSET, + BTRFS_SEND_A_DATA, + + BTRFS_SEND_A_CLONE_UUID, + BTRFS_SEND_A_CLONE_CTRANSID, + BTRFS_SEND_A_CLONE_PATH, + BTRFS_SEND_A_CLONE_OFFSET, + BTRFS_SEND_A_CLONE_LEN, + + __BTRFS_SEND_A_MAX, +}; +#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) + +#ifdef __KERNEL__ +long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +#endif -- cgit v1.2.3 From b24baf6917a376420d535548e1f88744028bcf24 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jul 2012 19:21:10 -0400 Subject: Btrfs: uninit variable fixes in send/receive Signed-off-by: Chris Mason --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5394cb75012..bf232c88a0b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2676,7 +2676,7 @@ static int process_recorded_refs(struct send_ctx *sctx) struct ulist_iterator uit; struct ulist_node *un; struct fs_path *valid_path = NULL; - u64 ow_inode; + u64 ow_inode = 0; u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; @@ -3553,7 +3553,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; loff_t pos = offset; - int readed; + int readed = 0; mm_segment_t old_fs; p = fs_path_alloc(sctx); -- cgit v1.2.3 From a1857ebe752d77d96c89d964500a9528df6d320e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 27 Jul 2012 10:11:13 +1000 Subject: Btrfs: using vmalloc and friends needs vmalloc.h On powerpc, we don't get the implicit vmalloc.h include, and as a result the build fails noisily: fs/btrfs/send.c: In function 'fs_path_free': fs/btrfs/send.c:185:4: error: implicit declaration of function 'vfree' [-Werror=implicit-function-declaration] fs/btrfs/send.c: In function 'fs_path_ensure_buf': fs/btrfs/send.c:215:4: error: implicit declaration of function 'vmalloc' [-Werror=implicit-function-declaration] fs/btrfs/send.c:215:12: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:225:12: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:233:13: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c: In function 'iterate_dir_item': fs/btrfs/send.c:900:10: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:909:11: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c: In function 'btrfs_ioctl_send': fs/btrfs/send.c:4463:17: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:4469:17: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:4475:2: error: implicit declaration of function 'vzalloc' [-Werror=implicit-function-declaration] fs/btrfs/send.c:4475:20: warning: assignment makes pointer from integer without a cast [enabled by default] fs/btrfs/send.c:4483:21: warning: assignment makes pointer from integer without a cast [enabled by default] Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- fs/btrfs/send.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c8ca49b1bb4..fb5ffe95f86 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "send.h" #include "backref.h" -- cgit v1.2.3 From e7848683ae7ded0a4a8964122a47da9104a98337 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:32 +0200 Subject: btrfs: Push mnt_want_write() outside of i_mutex When mnt_want_write() starts to handle freezing it will get a full lock semantics requiring proper lock ordering. So push mnt_want_write() call consistently outside of i_mutex. CC: Chris Mason CC: linux-btrfs@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1e9f6c019ad..cd93eb530b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -193,6 +193,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(file); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); ip_oldflags = ip->flags; @@ -207,10 +211,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write_file(file); - if (ret) - goto out_unlock; - if (flags & FS_SYNC_FL) ip->flags |= BTRFS_INODE_SYNC; else @@ -273,9 +273,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode->i_flags = i_oldflags; } - mnt_drop_write_file(file); out_unlock: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); return ret; } @@ -641,6 +641,10 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; + error = mnt_want_write(parent->mnt); + if (error) + return error; + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); @@ -652,13 +656,9 @@ static noinline int btrfs_mksubvol(struct path *parent, if (dentry->d_inode) goto out_dput; - error = mnt_want_write(parent->mnt); - if (error) - goto out_dput; - error = btrfs_may_create(dir, dentry); if (error) - goto out_drop_write; + goto out_dput; down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); @@ -676,12 +676,11 @@ static noinline int btrfs_mksubvol(struct path *parent, fsnotify_mkdir(dir, dentry); out_up_read: up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); -out_drop_write: - mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: mutex_unlock(&dir->i_mutex); + mnt_drop_write(parent->mnt); return error; } -- cgit v1.2.3 From 533574c6bc30cf526cc1c41bde050c854a945efb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 30 Jul 2012 14:40:13 -0700 Subject: btrfs: use printk_get_level and printk_skip_level, add __printf, fix fallout Use the generic printk_get_level() to search a message for a kern_level. Add __printf to verify format and arguments. Fix a few messages that had mismatches in format and arguments. Add #ifdef CONFIG_PRINTK blocks to shrink the object size a bit when not using printk. [akpm@linux-foundation.org: whitespace tweak] Signed-off-by: Joe Perches Cc: Kay Sievers Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/ctree.h | 13 +++++++++++++ fs/btrfs/disk-io.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c | 41 +++++++++++++++++++++++++++++++++++------ 4 files changed, 50 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index adb1cd7ceb9..4bab807227a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3342,10 +3342,22 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); + +#ifdef CONFIG_PRINTK +__printf(2, 3) void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); +#else +static inline __printf(2, 3) +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} +#endif + +__printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); + void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -3386,6 +3398,7 @@ do { \ (errno), fmt, ##args); \ } while (0) +__printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 502b20c56e8..fadeba6a5db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1114,7 +1114,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_unlock(&root->fs_info->delalloc_lock); btrfs_panic(root->fs_info, -EOVERFLOW, "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%lu)", + " dirty_mdatadata_bytes (%llu)", buf->len, root->fs_info->dirty_metadata_bytes); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c5dbd914967..4da08652004 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1241,7 +1241,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n"); + "tree\n", node->bytenr); kfree(node); return -EEXIST; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fa61ef59cd6..8c6e61d6eed 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -125,6 +125,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } +#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. @@ -167,7 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, va_end(args); } -const char *logtypes[] = { +static const char * const logtypes[] = { "emergency", "alert", "critical", @@ -185,22 +186,50 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) struct va_format vaf; va_list args; const char *type = logtypes[4]; + int kern_level; va_start(args, fmt); - if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - memcpy(lvl, fmt, 3); - lvl[3] = '\0'; - fmt += 3; - type = logtypes[fmt[1] - '0']; + kern_level = printk_get_level(fmt); + if (kern_level) { + size_t size = printk_skip_level(fmt) - fmt; + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + fmt += size; + type = logtypes[kern_level - '0']; } else *lvl = '\0'; vaf.fmt = fmt; vaf.va = &args; + printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); + + va_end(args); } +#else + +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } +} +#endif + /* * We only mark the transaction aborted and then set the file system read-only. * This will prevent new transactions from starting or trying to join this -- cgit v1.2.3 From b2b5ef5c8e89f19b68c174bf246f3ca212dbf0bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:45 +0200 Subject: btrfs: Convert to new freezing mechanism We convert btrfs_file_aio_write() to use new freeze check. We also add proper freeze protection to btrfs_page_mkwrite(). We also add freeze protection to the transaction mechanism to avoid starting transactions on frozen filesystem. At minimum this is necessary to stop iput() of unlinked file to change frozen filesystem during truncation. Checks in cleaner_kthread() and transaction_kthread() can be safely removed since btrfs_freeze() will lock the mutexes and thus block the threads (and they shouldn't have anything to do anyway). CC: linux-btrfs@vger.kernel.org CC: Chris Mason Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 6 +++++- fs/btrfs/transaction.c | 7 +++++++ 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2936ca49b3b..20c49b16b75 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1533,8 +1533,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); @@ -1566,7 +1564,6 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138..5caf285c6e4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t err = 0; size_t count, ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb8d671d00e..f4d901786b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6620,6 +6620,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + sb_start_pagefault(inode->i_sb); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); @@ -6709,12 +6710,15 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - if (!ret) + if (!ret) { + sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; + } unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out_noreserve: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183e..fa67ba51516 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -329,6 +329,8 @@ again: if (!h) return ERR_PTR(-ENOMEM); + sb_start_intwrite(root->fs_info->sb); + if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -339,6 +341,7 @@ again: } while (ret == -EBUSY); if (ret < 0) { + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -528,6 +531,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } + sb_end_intwrite(root->fs_info->sb); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1517,6 +1522,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + sb_end_intwrite(root->fs_info->sb); + trace_btrfs_transaction_commit(root); btrfs_scrub_continue(root); -- cgit v1.2.3 From 34eaadaf22b0dd453288c6b115e0c823a0fb74d5 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:12:05 +0300 Subject: btrfs: nuke write_super from comments The '->write_super' superblock method is gone, and this patch removes all the references to 'write_super' from btrfs. Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/btrfs/super.c | 4 ---- fs/btrfs/volumes.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8c6e61d6eed..f2eb24c477a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; } -/* NOTE: - * We move write_super stuff at umount in order to avoid deadlock - * for umount hold all lock. - */ static void save_error_info(struct btrfs_fs_info *fs_info) { __save_error_info(fs_info); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8708f994e6..e86ae04abe6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1744,10 +1744,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->fs_devices = root->fs_info->fs_devices; - /* - * we don't want write_supers to jump in here with our device - * half setup - */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, -- cgit v1.2.3 From b257031408945eb89980e14cb79d5fd854d8f25f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:12:06 +0300 Subject: btrfs: nuke pdflush from comments The pdflush thread is long gone, so this patch removes references to pdflush from btrfs comments. Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83baec24946..6e8f416773d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow, * If this code finds it can't get good compression, it puts an * entry onto the work queue to write the uncompressed bytes. This * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that pdflush sent them down. + * are written in the same order that the flusher thread sent them + * down. */ static noinline int compress_file_range(struct inode *inode, struct page *locked_page, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 643335a4fe3..051c7fe551d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting - * for pdflush to find them + * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->i_mapping, start, end); -- cgit v1.2.3 From e00da2067b78a9246f767012a3803224c40b1f9f Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Thu, 2 Aug 2012 17:16:20 -0600 Subject: Btrfs: remove mnt_want_write call in btrfs_mksubvol We got a recursive lock in mksubvol because the caller already held a lock. I think we got into this due to a merge error. Commit a874a63 removed the mnt_want_write call from btrfs_mksubvol and added a replacement call to mnt_want_write_file in btrfs_ioctl_snap_create_transid. Commit e7848683 however tried to move all calls to mnt_want_write above i_mutex. So somewhere while merging this, it got mixed up. The solution is to remove the mnt_want_write call completely from mksubvol. Reported-by: David Sterba Signed-off-by: Alexander Block Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bc2f6ffff3c..7bb755677a2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -664,10 +664,6 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - error = mnt_want_write(parent->mnt); - if (error) - return error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); @@ -703,7 +699,6 @@ out_dput: dput(dentry); out_unlock: mutex_unlock(&dir->i_mutex); - mnt_drop_write(parent->mnt); return error; } -- cgit v1.2.3 From aa2ffd06168e25689e0eb9662bf4595ba2bbac14 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 26 Jul 2012 03:40:35 -0600 Subject: Btrfs: fix a misplaced address operator in a condition This should obviously not be "if (&flag)" but "if (flag)". Signed-off-by: Stefan Behrens --- fs/btrfs/locking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a44eff07480..2a1762c6604 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -67,7 +67,7 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { if (eb->lock_nested) { read_lock(&eb->lock); - if (&eb->lock_nested && current->pid == eb->lock_owner) { + if (eb->lock_nested && current->pid == eb->lock_owner) { read_unlock(&eb->lock); return; } -- cgit v1.2.3 From 5986802c2fcc754040bb7ed95f30bb16c4a843b7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jul 2012 02:16:10 -0600 Subject: Btrfs: fix some error codes in btrfs_qgroup_inherit() These are returning zero when it should be returning a negative error code. Signed-off-by: Dan Carpenter --- fs/btrfs/qgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bc424ae5a81..229ef8927e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1369,8 +1369,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); - if (!srcgroup) + if (!srcgroup) { + ret = -EINVAL; goto unlock; + } dstgroup->rfer = srcgroup->rfer - level_size; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; srcgroup->excl = level_size; @@ -1379,8 +1381,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, qgroup_dirty(fs_info, srcgroup); } - if (!inherit) + if (!inherit) { + ret = -EINVAL; goto unlock; + } i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { -- cgit v1.2.3 From 57a5a882031dba5cb7bc7ebc955b897498365fe2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jul 2012 02:15:43 -0600 Subject: Btrfs: checking for NULL instead of IS_ERR add_qgroup_rb() never returns NULL, only error pointers. Signed-off-by: Dan Carpenter --- fs/btrfs/qgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 229ef8927e6..38b42e7bc91 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1364,8 +1364,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); dstgroup = add_qgroup_rb(fs_info, objectid); - if (!dstgroup) + if (IS_ERR(dstgroup)) { + ret = PTR_ERR(dstgroup); goto unlock; + } if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); -- cgit v1.2.3 From 55e591ffde38e0088b022129e035e18a8d04c7e6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jul 2012 02:15:15 -0600 Subject: Btrfs: unlock on error in btrfs_delalloc_reserve_metadata() We should release this mutex before returning the error code. Signed-off-by: Dan Carpenter --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4e1b153b7c4..45c69c4184c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4571,8 +4571,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) + if (ret) { + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; + } } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); -- cgit v1.2.3 From dadd1105ca9a1e506c678e8e410e9623efdda821 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jul 2012 02:10:44 -0600 Subject: Btrfs: fix some endian bugs handling the root times "trans->transid" is cpu endian but we want to store the data as little endian. "item->ctime.nsec" is only 32 bits, not 64. Signed-off-by: Dan Carpenter --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 43f0012016e..a1fbca0a100 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -424,7 +424,7 @@ static noinline int create_subvol(struct btrfs_root *root, uuid_le_gen(&new_uuid); memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); root_item.otime.sec = cpu_to_le64(cur_time.tv_sec); - root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec); + root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec); root_item.ctime = root_item.otime; btrfs_set_root_ctransid(&root_item, trans->transid); btrfs_set_root_otransid(&root_item, trans->transid); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6bb465cca20..10d8e4d8807 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -544,8 +544,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct timespec ct = CURRENT_TIME; spin_lock(&root->root_times_lock); - item->ctransid = trans->transid; + item->ctransid = cpu_to_le64(trans->transid); item->ctime.sec = cpu_to_le64(ct.tv_sec); - item->ctime.nsec = cpu_to_le64(ct.tv_nsec); + item->ctime.nsec = cpu_to_le32(ct.tv_nsec); spin_unlock(&root->root_times_lock); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7ac7cdcc294..7208ada41e0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1061,7 +1061,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); - new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec); + new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); -- cgit v1.2.3 From eb838e73dc2121d2bae47d5678952cd7d48793b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2012 16:28:48 -0400 Subject: Btrfs: lock extents as we map them in DIO A deadlock in xfstests 113 was uncovered by commit d187663ef24cd3d033f0cbf2867e70b36a3a90b8 This is because we would not return EIOCBQUEUED for short AIO reads, instead we'd wait for the DIO to complete and then return the amount of data we transferred, which would allow our stuff to unlock the remaning amount. But with this change this no longer happens, so if we have a short AIO read (for example if we try to read past EOF), we could leave the section from EOF to the end of where we tried to read locked. Fixing this is tricky since there is no clear way to know exactly how much data DIO truly submitted for IO, so to make this less hard on ourselves and less combersome we need to lock the extents as we try to map them, and then we unlock any areas we didn't actually map. This makes us completely safe from deadlocks and reliance on a particular behavior of the DIO code. This also lays the groundwork for allowing us to use the normal csum storage method for reads which means we can remove an allocation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 256 +++++++++++++++++++++++++++---------------------------- 1 file changed, 127 insertions(+), 129 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dac1fc21d80..09182449cbd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5773,18 +5773,109 @@ out: return ret; } +static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, + struct extent_state **cached_state, int writing) +{ + struct btrfs_ordered_extent *ordered; + int ret = 0; + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, cached_state); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && (!writing || + !test_range_bit(&BTRFS_I(inode)->io_tree, + lockstart, lockend, EXTENT_UPTODATE, 0, + *cached_state))) + break; + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state, GFP_NOFS); + + if (ordered) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } else { + /* Screw you mmap */ + ret = filemap_write_and_wait_range(inode->i_mapping, + lockstart, + lockend); + if (ret) + break; + + /* + * If we found a page that couldn't be invalidated just + * fall back to buffered. + */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + lockstart >> PAGE_CACHE_SHIFT, + lockend >> PAGE_CACHE_SHIFT); + if (ret) + break; + } + + cond_resched(); + } + + return ret; +} + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; u64 start = iblock << inode->i_blkbits; + u64 lockstart, lockend; u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; + int unlock_bits = EXTENT_LOCKED; + int ret; + + lockstart = start; + lockend = start + len - 1; + if (create) { + ret = btrfs_delalloc_reserve_space(inode, len); + if (ret) + return ret; + unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + } + + /* + * If this errors out it's because we couldn't invalidate pagecache for + * this range and we need to fallback to buffered. + */ + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) + return -ENOTBLK; + + if (create) { + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + if (ret) + goto unlock_err; + } em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } /* * Ok for INLINE and COMPRESSED extents we need to fallback on buffered @@ -5803,17 +5894,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - return -ENOTBLK; + ret = -ENOTBLK; + goto unlock_err; } /* Just a good old fashioned hole, return */ if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - /* DIO will do one hole at a time, so just unlock a sector */ - unlock_extent(&BTRFS_I(inode)->io_tree, start, - start + root->sectorsize - 1); - return 0; + ret = 0; + goto unlock_err; } /* @@ -5826,8 +5916,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * */ if (!create) { - len = em->len - (start - em->start); - goto map; + len = min(len, em->len - (start - em->start)); + lockstart = start + len; + goto unlock; } if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || @@ -5859,7 +5950,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, btrfs_end_transaction(trans, root); if (ret) { free_extent_map(em); - return ret; + goto unlock_err; } goto unlock; } @@ -5872,14 +5963,12 @@ must_cow: */ len = bh_result->b_size; em = btrfs_new_extent_direct(inode, em, start, len); - if (IS_ERR(em)) - return PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto unlock_err; + } len = min(len, em->len - (start - em->start)); unlock: - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, - EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, - 0, NULL, GFP_NOFS); -map: bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; bh_result->b_size = len; @@ -5897,9 +5986,28 @@ map: i_size_write(inode, start + len); } + /* + * In the case of write we need to clear and unlock the entire range, + * in the case of read we need to unlock only the end area that we + * aren't using if there is any left over space. + */ + if (lockstart < lockend) + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state, GFP_NOFS); + else + free_extent_state(cached_state); + free_extent_map(em); return 0; + +unlock_err: + if (create) + unlock_bits |= EXTENT_DO_ACCOUNTING; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state, GFP_NOFS); + return ret; } struct btrfs_dio_private { @@ -6340,132 +6448,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io out: return retval; } + static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - u64 lockstart, lockend; - ssize_t ret; - int writing = rw & WRITE; - int write_bits = 0; - size_t count = iov_length(iov, nr_segs); if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) { + offset, nr_segs)) return 0; - } - - lockstart = offset; - lockend = offset + count - 1; - - if (writing) { - ret = btrfs_delalloc_reserve_space(inode, count); - if (ret) - goto out; - } - while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. - */ - if (!ordered && (!writing || - !test_range_bit(&BTRFS_I(inode)->io_tree, - lockstart, lockend, EXTENT_UPTODATE, 0, - cached_state))) - break; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); - - if (ordered) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } else { - /* Screw you mmap */ - ret = filemap_write_and_wait_range(file->f_mapping, - lockstart, - lockend); - if (ret) - goto out; - - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. - */ - ret = invalidate_inode_pages2_range(file->f_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) { - if (ret == -EBUSY) - ret = 0; - goto out; - } - } - - cond_resched(); - } - - /* - * we don't use btrfs_set_extent_delalloc because we don't want - * the dirty or uptodate bits - */ - if (writing) { - write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DELALLOC, NULL, &cached_state, - GFP_NOFS); - if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_LOCKED | write_bits, - 1, 0, &cached_state, GFP_NOFS); - goto out; - } - } - - free_extent_state(cached_state); - cached_state = NULL; - - ret = __blockdev_direct_IO(rw, iocb, inode, + return __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, 0); - - if (ret < 0 && ret != -EIOCBQUEUED) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, - offset + iov_length(iov, nr_segs) - 1, - EXTENT_LOCKED | write_bits, 1, 0, - &cached_state, GFP_NOFS); - } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { - /* - * We're falling back to buffered, unlock the section we didn't - * do IO on. - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, - offset + iov_length(iov, nr_segs) - 1, - EXTENT_LOCKED | write_bits, 1, 0, - &cached_state, GFP_NOFS); - } -out: - free_extent_state(cached_state); - return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.3 From 3627bf4503b504077332c13496cb1bd54713bcbb Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 1 Aug 2012 04:28:01 -0600 Subject: Btrfs: fix that error value is changed by mistake In iterate_inodes_from_logical() the error result from extent_from_logical() is patched by mistake. Typically ENOENT is patched to EINVAL because (-ENOENT & BTRFS_EXTENT_FLAG_TREE_BLOCK) evaluates to true. Signed-off-by: Stefan Behrens --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a256f3b2a84..ff6475f409d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1438,10 +1438,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, ret = extent_from_logical(fs_info, logical, path, &found_key); btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = -EINVAL; if (ret < 0) return ret; + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return -EINVAL; extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, -- cgit v1.2.3 From aa9ddcd4b5557102fa25695c11904f249b4dec49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 2 Aug 2012 10:22:20 -0400 Subject: Btrfs: do not use missing devices when showing devname If you do the following mkfs.btrfs /dev/sdb /dev/sdc rmmod btrfs dd if=/dev/zero of=/dev/sdb bs=1M count=1 mount -o degraded /dev/sdc /mnt/btrfs-test the box will panic trying to deref the name for the missing dev since it is the lower numbered devid. So fix show_devname to not use missing devices. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 75ee2c7791f..2e06f124f28 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1505,6 +1505,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) while (cur_devices) { head = &cur_devices->devices; list_for_each_entry(dev, head, dev_list) { + if (dev->missing) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } -- cgit v1.2.3 From 99f5944b8477914406173b47b4f261356286730b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 2 Aug 2012 10:23:59 -0400 Subject: Btrfs: do not strdup non existent strings When we close devices we add back empty devices for some reason that escapes me. In the case of a missing dev we don't allocate an rcu_string for it's name, so check to see if the device has a name and if it doesn't don't bother strdup()'ing it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8708f994e6..3b394503bd4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -569,9 +569,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) memcpy(new_device, device, sizeof(*new_device)); /* Safe because we are under uuid_mutex */ - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(device->name && !name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(device->name && !name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; -- cgit v1.2.3 From c329861da40623cd838b8c9ee31a850242fd88cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 Aug 2012 16:49:19 -0400 Subject: Btrfs: don't allocate a seperate csums array for direct reads We've been allocating a big array for csums instead of storing them in the io_tree like we do for buffered reads because previously we were locking the entire range, so we didn't have an extent state for each sector of the range. But now that we do the range locking as we map the buffers we can limit the mapping lenght to sectorsize and use the private part of the io_tree for our csums. This allows us to avoid an extra memory allocation for direct reads which could incur latency. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/inode.c | 45 ++++++++++++++++----------------------------- 3 files changed, 19 insertions(+), 32 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index adb1cd7ceb9..348196350bf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3192,7 +3192,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset, u32 *dst); + struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b45b9de0c21..857d93cd01d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -272,9 +272,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 offset, u32 *dst) + struct bio *bio, u64 offset) { - return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); + return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09182449cbd..2d65c52b094 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5847,15 +5847,18 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, int unlock_bits = EXTENT_LOCKED; int ret; - lockstart = start; - lockend = start + len - 1; if (create) { ret = btrfs_delalloc_reserve_space(inode, len); if (ret) return ret; unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + } else { + len = min_t(u64, len, root->sectorsize); } + lockstart = start; + lockend = start + len - 1; + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -6015,7 +6018,6 @@ struct btrfs_dio_private { u64 logical_offset; u64 disk_bytenr; u64 bytes; - u32 *csums; void *private; /* number of bios pending for this dio */ @@ -6035,7 +6037,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; u64 start; - u32 *private = dip->csums; start = dip->logical_offset; do { @@ -6043,8 +6044,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct page *page = bvec->bv_page; char *kaddr; u32 csum = ~(u32)0; + u64 private = ~(u32)0; unsigned long flags; + if (get_state_private(&BTRFS_I(inode)->io_tree, + start, &private)) + goto failed; local_irq_save(flags); kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, @@ -6054,18 +6059,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != *private) { + if (csum != private) { +failed: printk(KERN_ERR "btrfs csum failed ino %llu off" " %llu csum %u private %u\n", (unsigned long long)btrfs_ino(inode), (unsigned long long)start, - csum, *private); + csum, (unsigned)private); err = -EIO; } } start += bvec->bv_len; - private++; bvec++; } while (bvec <= bvec_end); @@ -6073,7 +6078,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) dip->logical_offset + dip->bytes - 1); bio->bi_private = dip->private; - kfree(dip->csums); kfree(dip); /* If we had a csum failure make sure to clear the uptodate flag */ @@ -6179,7 +6183,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, - u32 *csums, int async_submit) + int async_submit) { int write = rw & REQ_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -6212,8 +6216,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, bio, - file_offset, csums); + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset); if (ret) goto err; } @@ -6239,10 +6242,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 submit_len = 0; u64 map_length; int nr_pages = 0; - u32 *csums = dip->csums; int ret = 0; int async_submit = 0; - int write = rw & REQ_WRITE; map_length = orig_bio->bi_size; ret = btrfs_map_block(map_tree, READ, start_sector << 9, @@ -6278,16 +6279,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums, async_submit); + async_submit); if (ret) { bio_put(bio); atomic_dec(&dip->pending_bios); goto out_err; } - /* Write's use the ordered csums */ - if (!write && !skip_sum) - csums = csums + nr_pages; start_sector += submit_len >> 9; file_offset += submit_len; @@ -6317,7 +6315,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, submit: ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums, async_submit); + async_submit); if (!ret) return 0; @@ -6353,17 +6351,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, ret = -ENOMEM; goto free_ordered; } - dip->csums = NULL; - - /* Write's use the ordered csum stuff, so we don't need dip->csums */ - if (!write && !skip_sum) { - dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); - if (!dip->csums) { - kfree(dip); - ret = -ENOMEM; - goto free_ordered; - } - } dip->private = bio->bi_private; dip->inode = inode; -- cgit v1.2.3 From 6209526531e70c080f79318ab8f50e26846c40a8 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Sat, 4 Aug 2012 01:45:02 -0600 Subject: btrfs: fix second lock in btrfs_delete_delayed_items() Fix a real bug caught by coccinelle. fs/btrfs/delayed-inode.c:1013:1-11: second lock on line 1013 Signed-off-by: Fengguang Wu --- fs/btrfs/delayed-inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 335605c8cea..00deed4ef3e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1028,9 +1028,10 @@ do_again: btrfs_release_delayed_item(prev); ret = 0; btrfs_release_path(path); - if (curr) + if (curr) { + mutex_unlock(&node->mutex); goto do_again; - else + } else goto delete_fail; } -- cgit v1.2.3 From 1fa11e265fa2562fb713171b6a58e72bb7afd276 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 6 Aug 2012 14:18:51 -0600 Subject: Btrfs: fix deadlock in wait_for_more_refs Commit a168650c introduced a waiting mechanism to prevent busy waiting in btrfs_run_delayed_refs. This can deadlock with btrfs_run_ordered_operations, where a tree_mod_seq is held while waiting for the io to complete, while the end_io calls btrfs_run_delayed_refs. This whole mechanism is unnecessary. If not enough runnable refs are available to satisfy count, just return as count is more like a guideline than a strict requirement. In case we have to run all refs, commit transaction makes sure that no other threads are working in the transaction anymore, so we just assert here that no refs are blocked. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 6 ---- fs/btrfs/ctree.h | 1 - fs/btrfs/delayed-ref.c | 8 ------ fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent-tree.c | 77 ++++++++++++++------------------------------------ 5 files changed, 21 insertions(+), 73 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9d7621f271f..08e0b11ba0a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -420,12 +420,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, } spin_unlock(&fs_info->tree_mod_seq_lock); - /* - * we removed the lowest blocker from the blocker list, so there may be - * more processible delayed refs. - */ - wake_up(&fs_info->tree_mod_seq_wait); - /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 348196350bf..c38734a07a6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1252,7 +1252,6 @@ struct btrfs_fs_info { atomic_t tree_mod_seq; struct list_head tree_mod_seq_list; struct seq_list tree_mod_seq_elem; - wait_queue_head_t tree_mod_seq_wait; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index da7419ed01b..7561431af50 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -662,9 +662,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&fs_info->tree_mod_seq_wait)) - wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); if (need_ref_seq(for_cow, ref_root)) btrfs_qgroup_record_ref(trans, &ref->node, extent_op); @@ -713,9 +710,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&fs_info->tree_mod_seq_wait)) - wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); if (need_ref_seq(for_cow, ref_root)) btrfs_qgroup_record_ref(trans, &ref->node, extent_op); @@ -744,8 +738,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - if (waitqueue_active(&fs_info->tree_mod_seq_wait)) - wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 502b20c56e8..a7ad8fc8dc5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2035,8 +2035,6 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; - init_waitqueue_head(&fs_info->tree_mod_seq_wait); - /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45c69c4184c..d3df65f83b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2318,12 +2318,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. - */ spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2350,22 +2344,6 @@ next: return count; } -static void wait_for_more_refs(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs, - struct list_head *first_seq) -{ - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(fs_info->tree_mod_seq_wait, - num_refs != delayed_refs->num_entries || - fs_info->tree_mod_seq_list.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, fs_info->tree_mod_seq_list.next); - spin_lock(&delayed_refs->lock); -} - #ifdef SCRAMBLE_DELAYED_REFS /* * Normally delayed refs get processed in ascending bytenr order. This @@ -2460,13 +2438,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct list_head cluster; - struct list_head *first_seq = NULL; int ret; u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; + int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2484,7 +2460,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: - consider_waiting = 0; + loops = 0; spin_lock(&delayed_refs->lock); #ifdef SCRAMBLE_DELAYED_REFS @@ -2512,31 +2488,6 @@ again: if (ret) break; - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - first_seq = root->fs_info->tree_mod_seq_list.next; - } else { - wait_for_more_refs(root->fs_info, delayed_refs, - num_refs, first_seq); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. - * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { spin_unlock(&delayed_refs->lock); @@ -2549,9 +2500,26 @@ again: if (count == 0) break; - if (ret || delayed_refs->run_delayed_start == 0) { + if (delayed_start >= delayed_refs->run_delayed_start) { + if (loops == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked), bail out. + */ + loops = 1; + } else { + /* + * no runnable refs left, stop trying + */ + BUG_ON(run_all); + break; + } + } + if (ret) { /* refs were run, let's reset staleness detection */ - consider_waiting = 0; + loops = 0; } } @@ -5296,9 +5264,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - smp_mb(); - if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) - wake_up(&root->fs_info->tree_mod_seq_wait); /* * we don't take a ref on the node because we're removing it from the -- cgit v1.2.3 From 66657b318e0e443ada229fccd40c8be86cfebdbf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 1 Aug 2012 15:36:24 -0400 Subject: Btrfs: barrier before waitqueue_active We need a barrir before calling waitqueue_active otherwise we will miss wakeups. So in places that do atomic_dec(); then atomic_read() use atomic_dec_return() which imply a memory barrier (see memory-barriers.txt) and then add an explicit memory barrier everywhere else that need them. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/compression.c | 1 + fs/btrfs/delayed-inode.c | 7 +++---- fs/btrfs/disk-io.c | 7 ++++--- fs/btrfs/inode.c | 4 +--- fs/btrfs/volumes.c | 3 +-- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 86eff48dab7..43d1c5a3a03 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -818,6 +818,7 @@ static void free_workspace(int type, struct list_head *workspace) btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(alloc_workspace); wake: + smp_mb(); if (waitqueue_active(workspace_wait)) wake_up(workspace_wait); } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 00deed4ef3e..07d5eeb1e6f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -512,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && + if (atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); } @@ -1056,8 +1056,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < + if (atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ad8fc8dc5..dd86a5d8842 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -754,9 +754,7 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -3783,14 +3781,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) /* FIXME: cleanup wait for commit */ t->in_commit = 1; t->blocked = 1; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); t->blocked = 0; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); t->commit_done = 1; + smp_mb(); if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d65c52b094..97baf00b40d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1007,9 +1007,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; - atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); - - if (atomic_read(&root->fs_info->async_delalloc_pages) < + if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b394503bd4..0b1e69d380d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -227,9 +227,8 @@ loop_lock: cur = pending; pending = pending->bi_next; cur->bi_next = NULL; - atomic_dec(&fs_info->nr_async_bios); - if (atomic_read(&fs_info->nr_async_bios) < limit && + if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); -- cgit v1.2.3 From 6fc823b10f333313deb0b5d9069cbfd3a3f99f3a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Aug 2012 13:46:38 -0600 Subject: Btrfs: increase the size of the free space cache Arne was complaining about the space cache having mismatching generation numbers when debugging a deadlock. This is because we can run out of space in our preallocated range for our space cache if you have a pretty fragmented amount of space in your pinned space. So just increase the amount of space we preallocate for space cache so we can be sure to have enough space. This will only really affect data ranges since their the only chunks that end up larger than 256MB. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d3df65f83b5..1bb408f737f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2975,17 +2975,16 @@ again: } spin_unlock(&block_group->lock); - num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; - /* - * Just to make absolutely sure we have enough space, we're going to - * preallocate 12 pages worth of space for each block group. In - * practice we ought to use at most 8, but we need extra space so we can - * add our header and have a terminator between the extents and the - * bitmaps. - */ num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; -- cgit v1.2.3 From b12a3b1ea209d9dec02731fba58c3dbe7d31cfd8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 7 Aug 2012 15:34:49 -0400 Subject: Btrfs: don't run __tree_mod_log_free_eb on leaves When we split a leaf, we may end up inserting a new root on top of that leaf. The reflog code was incorrectly assuming the old root was always a node. This makes sure we skip over leaves. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 08e0b11ba0a..6d183f60d63 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -625,6 +625,9 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) u32 nritems; int ret; + if (btrfs_header_level(eb) == 0) + return; + nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { ret = tree_mod_log_insert_key_locked(fs_info, eb, i, -- cgit v1.2.3 From 22cd2e7de7b0bd68fb668d23e1564707ca689510 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 9 Aug 2012 00:16:53 -0600 Subject: Btrfs: fix race in run_clustered_refs With commit commit d1270cd91f308c9d22b2804720c36ccd32dbc35e Author: Arne Jansen Date: Tue Sep 13 15:16:43 2011 +0200 Btrfs: put back delayed refs that are too new I added a window where the delayed_ref's head->ref_mod code can diverge from the sum of the remaining refs, because we release the head->mutex in the middle. This leads to btrfs_lookup_extent_info returning wrong numbers. This patch fixes this by adjusting the head's ref_mod with each delayed ref we run. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1bb408f737f..f16411d3c25 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2318,6 +2318,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; + if (locked_ref) { + /* + * when we play the delayed ref, also correct the + * ref_mod on head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + } spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, -- cgit v1.2.3 From c0f62dedd04ae0f3b8a18079db5a015af24e416f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 8 Aug 2012 21:39:36 -0600 Subject: Btrfs: fix wrong mtime and ctime when creating snapshots When we created a new snapshot, the mtime and ctime of its parent directory were not updated. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7208ada41e0..3ee8d58e97a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1026,6 +1026,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, parent_root, parent_inode); if (ret) goto abort_trans_dput; -- cgit v1.2.3 From 5a24e84c55f57cc49bd1cab531b6ef28b6b7bdaa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Aug 2012 10:12:59 -0600 Subject: Btrfs: fix enospc problems when deleting a subvol Subvol delete is a special kind of awful where we use the global reserve to cover the ENOSPC requirements. The problem is once we're done removing everything we do a btrfs_update_inode(), which by default will try to do the delayed update stuff which will use it's own reserve. There will be no space in this reserve and we'll return ENOSPC. So instead use btrfs_update_inode_fallback() which will just fallback to updating the inode item in the case of enospc. This is fine because the global reserve covers the space requirements for this. With this patch I can now delete a subvol on a problem image Dave Sterba sent me. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97baf00b40d..0808f483daf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3171,7 +3171,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, root, ret); out: -- cgit v1.2.3 From ae1e206b806ccc490dadff59af8a7a2477b32884 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Aug 2012 16:00:32 -0400 Subject: Btrfs: allow delayed refs to be merged Daniel Blueman reported a bug with fio+balance on a ramdisk setup. Basically what happens is the balance relocates a tree block which will drop the implicit refs for all of its children and adds a full backref. Once the block is relocated we have to add the implicit refs back, so when we cow the block again we add the implicit refs for its children back. The problem comes when the original drop ref doesn't get run before we add the implicit refs back. The delayed ref stuff will specifically prefer ADD operations over DROP to keep us from freeing up an extent that will have references to it, so we try to add the implicit ref before it is actually removed and we panic. This worked fine before because the add would have just canceled the drop out and we would have been fine. But the backref walking work needs to be able to freeze the delayed ref stuff in time so we have this ever increasing sequence number that gets attached to all new delayed ref updates which makes us not merge refs and we run into this issue. So to fix this we need to merge delayed refs. So everytime we run a clustered ref we need to try and merge all of its delayed refs. The backref walking stuff locks the delayed ref head before processing, so if we have it locked we are safe to merge any refs inside of the sequence number. If there is no sequence number we can merge all refs. Doing this not only fixes our bug but keeps the delayed ref code from adding and removing useless refs and batching together multiple refs into one search instead of one search per delayed ref, which will really help our commit times. I ran this with Daniels test and 276 and I haven't seen any problems. Thanks, Reported-by: Daniel J Blueman Signed-off-by: Josef Bacik --- fs/btrfs/delayed-ref.c | 155 ++++++++++++++++++++++++++++++++++++++++--------- fs/btrfs/delayed-ref.h | 4 ++ fs/btrfs/extent-tree.c | 10 ++++ 3 files changed, 142 insertions(+), 27 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7561431af50..ae941177339 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -38,17 +38,14 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, struct btrfs_delayed_tree_ref *ref1) { - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; return 0; } @@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, * type of the delayed backrefs and content of delayed backrefs. */ static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1) + struct btrfs_delayed_ref_node *ref1, + bool compare_seq) { if (ref1->bytenr < ref2->bytenr) return -1; @@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, if (ref1->type > ref2->type) return 1; /* merging of sequenced refs is not allowed */ - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; + if (compare_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, rb_node); - cmp = comp_entry(entry, ins); + cmp = comp_entry(entry, ins, 1); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) @@ -233,6 +233,114 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } +static void inline drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *ref) +{ + rb_erase(&ref->rb_node, &delayed_refs->root); + ref->in_tree = 0; + btrfs_put_delayed_ref(ref); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; +} + +static int merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *ref, u64 seq) +{ + struct rb_node *node; + int merged = 0; + int mod = 0; + int done = 0; + + node = rb_prev(&ref->rb_node); + while (node) { + struct btrfs_delayed_ref_node *next; + + next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_prev(node); + if (next->bytenr != ref->bytenr) + break; + if (seq && next->seq >= seq) + break; + if (comp_entry(ref, next, 0)) + continue; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + struct btrfs_delayed_ref_node *tmp; + + tmp = ref; + ref = next; + next = tmp; + done = 1; + } + mod = -next->ref_mod; + } + + merged++; + drop_delayed_ref(trans, delayed_refs, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, ref); + break; + } else { + /* + * You can't have multiples of the same ref on a tree + * block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (done) + break; + node = rb_prev(&ref->rb_node); + } + + return merged; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + u64 seq = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + node = rb_prev(&head->node.rb_node); + while (node) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + + /* We can't merge refs that are outside of our seq count */ + if (seq && ref->seq >= seq) + break; + if (merge_ref(trans, delayed_refs, ref, seq)) + node = rb_prev(&head->node.rb_node); + else + node = rb_prev(node); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -336,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans, * every changing the extent allocation tree. */ existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } else { + if (existing->ref_mod == 0) + drop_delayed_ref(trans, delayed_refs, existing); + else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - } } else { WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 0d7c90c366b..ab530059584 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -167,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f16411d3c25..ba58024d40d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2251,6 +2251,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + */ + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); + /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates -- cgit v1.2.3 From 68ce9682a4bb95d6be5529cb57214bf2a1b7d20e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 1 Aug 2012 05:45:52 -0600 Subject: Btrfs: remove superblock writing after fatal error With commit acce952b0, btrfs was changed to flag the filesystem with BTRFS_SUPER_FLAG_ERROR and switch to read-only mode after a fatal error happened like a write I/O errors of all mirrors. In such situations, on unmount, the superblock is written in btrfs_error_commit_super(). This is done with the intention to be able to evaluate the error flag on the next mount. A warning is printed in this case during the next mount and the log tree is ignored. The issue is that it is possible that the superblock points to a root that was not written (due to write I/O errors). The result is that the filesystem cannot be mounted. btrfsck also does not start and all the other btrfs-progs tools fail to start as well. However, mount -o recovery is working well and does the right things to recover the filesystem (i.e., don't use the log root, clear the free space cache and use the next mountable root that is stored in the root backup array). This patch removes the writing of the superblock when BTRFS_SUPER_FLAG_ERROR is set, and removes the handling of the error flag in the mount function. These lines can be used to reproduce the issue (using /dev/sdm): SCRATCH_DEV=/dev/sdm SCRATCH_MNT=/mnt echo 0 25165824 linear $SCRATCH_DEV 0 | dmsetup create foo ls -alLF /dev/mapper/foo mkfs.btrfs /dev/mapper/foo mount /dev/mapper/foo $SCRATCH_MNT echo bar > $SCRATCH_MNT/foo sync echo 0 25165824 error | dmsetup reload foo dmsetup resume foo ls -alF $SCRATCH_MNT touch $SCRATCH_MNT/1 ls -alF $SCRATCH_MNT sleep 35 echo 0 25165824 linear $SCRATCH_DEV 0 | dmsetup reload foo dmsetup resume foo sleep 1 umount $SCRATCH_MNT btrfsck /dev/mapper/foo dmsetup remove foo Signed-off-by: Stefan Behrens Signed-off-by: Jan Schmidt --- fs/btrfs/disk-io.c | 36 ++++-------------------------------- fs/btrfs/disk-io.h | 2 +- 2 files changed, 5 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd86a5d8842..3c4c4397f47 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2527,8 +2527,7 @@ retry_root_backup: goto fail_trans_kthread; /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0 && - !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -3188,30 +3187,14 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(fs_info); - /* - * Here come 2 situations when btrfs is broken to flip readonly: - * - * 1. when btrfs flips readonly somewhere else before - * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, - * and btrfs will skip to write sb directly to keep - * ERROR state on disk. - * - * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannot write sb via btrfs_commit_super, - * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, - * btrfs will cleanup all FS resources first and write sb then. - */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - ret = btrfs_error_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3433,18 +3416,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (read_only) return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - printk(KERN_WARNING "warning: mount fs with errors, " - "running btrfsck is recommended\n"); - } - return 0; } -int btrfs_error_commit_super(struct btrfs_root *root) +void btrfs_error_commit_super(struct btrfs_root *root) { - int ret; - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -3454,10 +3430,6 @@ int btrfs_error_commit_super(struct btrfs_root *root) /* cleanup FS via transaction */ btrfs_cleanup_transaction(root); - - ret = write_ctree_super(NULL, root, 0); - - return ret; } static void btrfs_destroy_ordered_operations(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 95e147eea23..c5b00a735fe 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -int btrfs_error_commit_super(struct btrfs_root *root); +void btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, -- cgit v1.2.3 From 5ee0844d6427e7338e0aba748f62b62d07ea2ed0 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 27 Aug 2012 08:30:03 -0600 Subject: Btrfs: revert checksum error statistic which can cause a BUG() Commit 442a4f6308e694e0fa6025708bd5e4e424bbf51c added btrfs device statistic counters for detected IO and checksum errors to Linux 3.5. The statistic part that counts checksum errors in end_bio_extent_readpage() can cause a BUG() in a subfunction: "kernel BUG at fs/btrfs/volumes.c:3762!" That part is reverted with the current patch. However, the counting of checksum errors in the scrub context remains active, and the counting of detected IO errors (read, write or flush errors) in all contexts remains active. Cc: stable # 3.5 Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 17 ++--------------- fs/btrfs/volumes.c | 22 ---------------------- fs/btrfs/volumes.h | 2 -- 3 files changed, 2 insertions(+), 39 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e7c9ed6505..49085f2336d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2329,23 +2329,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); - if (ret) { - /* no IO indicated but software detected errors - * in the block, either checksum errors or - * issues with the contents */ - struct btrfs_root *root = - BTRFS_I(page->mapping->host)->root; - struct btrfs_device *device; - + if (ret) uptodate = 0; - device = btrfs_find_device_for_logical( - root, start, mirror); - if (device) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } else { + else clean_io_failure(start, page); - } } if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b1e69d380d..3f4e70e171e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4610,28 +4610,6 @@ int btrfs_read_sys_array(struct btrfs_root *root) return ret; } -struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, - u64 logical, int mirror_num) -{ - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - int ret; - u64 map_length = 0; - struct btrfs_bio *bbio = NULL; - struct btrfs_device *device; - - BUG_ON(mirror_num == 0); - ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, - mirror_num); - if (ret) { - BUG_ON(bbio != NULL); - return NULL; - } - BUG_ON(mirror_num != bbio->mirror_num); - device = bbio->stripes[mirror_num - 1].dev; - kfree(bbio); - return device; -} - int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5479325987b..53c06af92e8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -289,8 +289,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); -struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, - u64 logical, int mirror_num); void btrfs_dev_stat_print_on_error(struct btrfs_device *device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, -- cgit v1.2.3 From bd7de2c9a449e26a5493d918618eb20ae60d56bd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Aug 2012 12:53:03 -0600 Subject: Btrfs: fix deadlock with freeze and sync V2 We can deadlock with freeze right now because we unconditionally start a transaction in our ->sync_fs() call. To fix this just check and see if we have a running transaction to commit. This saves us from the deadlock because at this point we'll have the umount sem for the sb so we're safe from freezes coming in after we've done our check. With this patch the freeze xfstests no longer deadlocks. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2e06f124f28..073c2368f45 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -813,7 +813,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - int ret; trace_btrfs_sync_fs(wait); @@ -824,11 +823,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); - trans = btrfs_start_transaction(root, 0); + spin_lock(&fs_info->trans_lock); + if (!fs_info->running_transaction) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + spin_unlock(&fs_info->trans_lock); + + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - return ret; + return btrfs_commit_transaction(trans, root); } static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) -- cgit v1.2.3 From 24c03fa5cf3d02c327cf9f2fc39f72664b1bd7e1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 22 Aug 2012 20:10:38 -0600 Subject: Btrfs: fix a dio write regression This bug is introduced by commit 3b8bde746f6f9bd36a9f05f5f3b6e334318176a9 (Btrfs: lock extents as we map them in DIO). In dio write, we should unlock the section which we didn't do IO on in case that we fall back to buffered write. But we need to not only unlock the section but also cleanup reserved space for the section. This bug was found while running xfstests 133, with this 133 no longer complains. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0808f483daf..38cda78de5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5992,11 +5992,27 @@ unlock: * in the case of read we need to unlock only the end area that we * aren't using if there is any left over space. */ - if (lockstart < lockend) - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state, GFP_NOFS); - else + if (lockstart < lockend) { + if (create && len < lockend - lockstart) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); + /* + * Beside unlock, we also need to cleanup reserved space + * for the left range by attaching EXTENT_DO_ACCOUNTING. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, + lockstart + len, lockend, + unlock_bits | EXTENT_DO_ACCOUNTING, + 1, 0, NULL, GFP_NOFS); + } else { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); + } + } else { free_extent_state(cached_state); + } free_extent_map(em); -- cgit v1.2.3 From d280e5be940931c84bb2e9831ead9d02bc785484 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 21 Aug 2012 21:13:25 -0600 Subject: Btrfs: fix ordered extent leak when failing to start a transaction We cannot just return error before freeing ordered extent and releasing reserved space when we fail to start a transacion. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38cda78de5e..6ba80b90287 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1882,8 +1882,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ -- cgit v1.2.3 From 256dd1bb3750ac5ad49b40887c1691788dc44b33 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 10 Aug 2012 08:58:21 -0600 Subject: Btrfs: fix that repair code is spuriously executed for transid failures If verify_parent_transid() fails for all mirrors, the current code calls repair_io_failure() anyway which means: - that the disk block is rewritten without repairing anything and - that a kernel log message is printed which misleadingly claims that a read error was corrected. This is an example: parent transid verify failed on 615015833600 wanted 110423 found 110424 parent transid verify failed on 615015833600 wanted 110423 found 110424 btrfs read error corrected: ino 1 off 615015833600 (dev /dev/...) It is wrong to ignore the results from verify_parent_transid() and to call repair_eb_io_failure() when the verification of the transids failed. This commit fixes the issue. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3c4c4397f47..29c69e60d3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -377,9 +377,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, + if (!ret) { + if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; + break; + else + ret = -EIO; + } /* * This buffer's crc is fine, but its contents are corrupted, so -- cgit v1.2.3 From 8bad3c024496e30948f576485c10ecc9ebdfe41d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 18 Jun 2012 12:14:23 +0800 Subject: btrfs: fix comment typo in btrfs_finish_ordered_io Fix typo errors in comments of btrfs_finish_ordered_io. Signed-off-by: Liu Bo Signed-off-by: Jiri Kosina --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb8d671d00e..d8ed4fda001 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1969,8 +1969,8 @@ out: ordered_extent->len - 1, NULL, GFP_NOFS); /* - * This needs to be dont to make sure anybody waiting knows we are done - * upating everything for this ordered extent. + * This needs to be done to make sure anybody waiting knows we are done + * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); -- cgit v1.2.3 From 527a1361387cd132a577dc8a6ae676cfc1e8414b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 6 Sep 2012 13:33:37 +0800 Subject: btrfs: fix trivial typo for the comment of BTRFS_FREE_INO_OBJECTID It should be storing, not sotring. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jiri Kosina --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fa5c45b3907..11d0e09e161 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -113,7 +113,7 @@ struct btrfs_ordered_sum; #define BTRFS_FREE_SPACE_OBJECTID -11ULL /* - * The inode number assigned to the special inode for sotring + * The inode number assigned to the special inode for storing * free ino cache */ #define BTRFS_FREE_INO_OBJECTID -12ULL -- cgit v1.2.3 From f3a87f1b0c4086a408eda48e4c26e32ff80d3124 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 14 Sep 2012 20:06:30 -0400 Subject: Revert "Btrfs: fix some error codes in btrfs_qgroup_inherit()" This reverts commit 5986802c2fcc754040bb7ed95f30bb16c4a843b7. Both paths are not error paths but regular cases where non-qgroup subvols are involved. Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 38b42e7bc91..b6501558174 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1371,10 +1371,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); - if (!srcgroup) { - ret = -EINVAL; + if (!srcgroup) goto unlock; - } dstgroup->rfer = srcgroup->rfer - level_size; dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; srcgroup->excl = level_size; @@ -1383,10 +1381,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, qgroup_dirty(fs_info, srcgroup); } - if (!inherit) { - ret = -EINVAL; + if (!inherit) goto unlock; - } i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { -- cgit v1.2.3 From 5f3a4a28ec140a90e6058d1d09f6b1f235d485e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 20:17:44 -0700 Subject: userns: Pass a userns parameter into posix_acl_to_xattr and posix_acl_from_xattr - Pass the user namespace the uid and gid values in the xattr are stored in into posix_acl_from_xattr. - Pass the user namespace kuid and kgid values should be converted into when storing uid and gid values in an xattr in posix_acl_to_xattr. - Modify all callers of posix_acl_from_xattr and posix_acl_to_xattr to pass in &init_user_ns. In the short term this change is not strictly needed but it makes the code clearer. In the longer term this change is necessary to be able to mount filesystems outside of the initial user namespace that natively store posix acls in the linux xattr format. Cc: Theodore Tso Cc: Andrew Morton Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/btrfs/acl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 761e2cd8fed..0c16e3dbfd5 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); } if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; @@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return ret; @@ -141,7 +141,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) goto out; } @@ -169,7 +169,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); -- cgit v1.2.3 From 44a075bde9ab32d13bb2ff89f3e72fd869f272c4 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 21 Sep 2012 08:21:01 +0800 Subject: btrfs: fix the commment for the action flags in delayed-ref.h The action field has been merged into struct btrfs_delayed_ref_node, and no struct btrfs_delayed_ref is available now. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jiri Kosina --- fs/btrfs/delayed-ref.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 413927fb995..25589456d1c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,7 +18,7 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ -/* these are the possible values of struct btrfs_delayed_ref->action */ +/* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -- cgit v1.2.3 From 2f2f43d3c7b1da8dba56716dd1be196b6f57bf9b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Feb 2012 11:05:07 -0800 Subject: userns: Convert btrfs to use kuid/kgid where appropriate Cc: Chris Mason Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/btrfs/delayed-inode.c | 8 ++++---- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 335605c8cea..f908c518079 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1715,8 +1715,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { - btrfs_set_stack_inode_uid(inode_item, inode->i_uid); - btrfs_set_stack_inode_gid(inode_item, inode->i_gid); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); btrfs_set_stack_inode_mode(inode_item, inode->i_mode); btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); @@ -1764,8 +1764,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - inode->i_uid = btrfs_stack_inode_uid(inode_item); - inode->i_gid = btrfs_stack_inode_gid(inode_item); + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83baec24946..53687149c07 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2570,8 +2570,8 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - inode->i_uid = btrfs_inode_uid(leaf, inode_item); - inode->i_gid = btrfs_inode_gid(leaf, inode_item); + i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); tspec = btrfs_inode_atime(inode_item); @@ -2649,8 +2649,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); btrfs_set_inode_mode(leaf, item, inode->i_mode); btrfs_set_inode_nlink(leaf, item, inode->i_nlink); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bc2f6ffff3c..1292682c537 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -575,13 +575,13 @@ fail: */ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) { - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); if (!(dir->i_mode & S_ISVTX)) return 0; - if (inode->i_uid == fsuid) + if (uid_eq(inode->i_uid, fsuid)) return 0; - if (dir->i_uid == fsuid) + if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable(CAP_FOWNER); } -- cgit v1.2.3 From ecd188159efa112770d5f8a4a62f8d3586784f48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Aug 2012 21:20:24 -0400 Subject: switch btrfs_ioctl_snap_create_transid() to fget_light() Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9df50fa8a07..3c88abb0e26 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1422,7 +1422,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, NULL, transid, readonly, inherit); } else { struct inode *src_inode; - src_file = fget(fd); + int fput_needed; + src_file = fget_light(fd, &fput_needed); if (!src_file) { ret = -EINVAL; goto out_drop_write; @@ -1433,13 +1434,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, printk(KERN_INFO "btrfs: Snapshot src from " "another FS\n"); ret = -EINVAL; - fput(src_file); - goto out_drop_write; + } else { + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid, readonly, inherit); } - ret = btrfs_mksubvol(&file->f_path, name, namelen, - BTRFS_I(src_inode)->root, - transid, readonly, inherit); - fput(src_file); + fput_light(src_file, fput_needed); } out_drop_write: mnt_drop_write_file(file); -- cgit v1.2.3 From 8319aa9127a1282b24c3ece473a058d246f35b0d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 03:18:55 -0400 Subject: switch btrfs_ioctl_clone() to fget_light() Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3c88abb0e26..3494f2f4416 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2350,7 +2350,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, struct btrfs_key key; u32 nritems; int slot; - int ret; + int ret, fput_needed; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; u64 hint_byte; @@ -2376,7 +2376,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (ret) return ret; - src_file = fget(srcfd); + src_file = fget_light(srcfd, &fput_needed); if (!src_file) { ret = -EBADF; goto out_drop_write; @@ -2724,7 +2724,7 @@ out_unlock: vfree(buf); btrfs_free_path(path); out_fput: - fput(src_file); + fput_light(src_file, fput_needed); out_drop_write: mnt_drop_write_file(file); return ret; -- cgit v1.2.3 From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Aug 2012 12:52:22 -0400 Subject: switch simple cases of fget_light to fdget Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3494f2f4416..0a4f0c8bc58 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1397,7 +1397,6 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, u64 *transid, bool readonly, struct btrfs_qgroup_inherit **inherit) { - struct file *src_file; int namelen; int ret = 0; @@ -1421,15 +1420,14 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly, inherit); } else { + struct fd src = fdget(fd); struct inode *src_inode; - int fput_needed; - src_file = fget_light(fd, &fput_needed); - if (!src_file) { + if (!src.file) { ret = -EINVAL; goto out_drop_write; } - src_inode = src_file->f_path.dentry->d_inode; + src_inode = src.file->f_path.dentry->d_inode; if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { printk(KERN_INFO "btrfs: Snapshot src from " "another FS\n"); @@ -1439,7 +1437,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, BTRFS_I(src_inode)->root, transid, readonly, inherit); } - fput_light(src_file, fput_needed); + fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -2341,7 +2339,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct file *src_file; + struct fd src_file; struct inode *src; struct btrfs_trans_handle *trans; struct btrfs_path *path; @@ -2350,7 +2348,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, struct btrfs_key key; u32 nritems; int slot; - int ret, fput_needed; + int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; u64 hint_byte; @@ -2376,24 +2374,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (ret) return ret; - src_file = fget_light(srcfd, &fput_needed); - if (!src_file) { + src_file = fdget(srcfd); + if (!src_file.file) { ret = -EBADF; goto out_drop_write; } ret = -EXDEV; - if (src_file->f_path.mnt != file->f_path.mnt) + if (src_file.file->f_path.mnt != file->f_path.mnt) goto out_fput; - src = src_file->f_dentry->d_inode; + src = src_file.file->f_dentry->d_inode; ret = -EINVAL; if (src == inode) goto out_fput; /* the src must be open for reading */ - if (!(src_file->f_mode & FMODE_READ)) + if (!(src_file.file->f_mode & FMODE_READ)) goto out_fput; /* don't make the dst file partly checksummed */ @@ -2724,7 +2722,7 @@ out_unlock: vfree(buf); btrfs_free_path(path); out_fput: - fput_light(src_file, fput_needed); + fdput(src_file); out_drop_write: mnt_drop_write_file(file); return ret; -- cgit v1.2.3 From 85a7b33b9653ade7b26b9f29765cb1f0719c1e84 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Thu, 26 Jul 2012 23:39:10 +0200 Subject: Btrfs: add rdev to get_inode_info in send/receive We need rdev in the next commit. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fb5ffe95f86..5721401548e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -687,7 +687,8 @@ out: */ static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid) + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) { int ret; struct btrfs_inode_item *ii; @@ -721,6 +722,8 @@ static int get_inode_info(struct btrfs_root *root, *uid = btrfs_inode_uid(path->nodes[0], ii); if (gid) *gid = btrfs_inode_gid(path->nodes[0], ii); + if (rdev) + *rdev = btrfs_inode_rdev(path->nodes[0], ii); out: btrfs_free_path(path); @@ -1081,7 +1084,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * There are inodes that have extents that lie behind it's i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL); + ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, + NULL); if (ret < 0) return ret; @@ -1404,7 +1408,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1413,7 +1417,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; @@ -1557,7 +1561,7 @@ static int get_first_ref(struct send_ctx *sctx, btrfs_release_path(path); ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -1628,7 +1632,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, NULL, NULL, NULL); + who_gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1671,7 +1675,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -2540,7 +2544,7 @@ int __finish_unordered_dir(int num, struct btrfs_key *di_key, fs_path_reset(fctx->cur_path); ret = get_inode_info(sctx->send_root, di_key->objectid, - NULL, &di_gen, &di_mode, NULL, NULL); + NULL, &di_gen, &di_mode, NULL, NULL, NULL); if (ret < 0) goto out; @@ -2971,7 +2975,7 @@ static int __record_new_ref(int num, u64 dir, int index, return -ENOMEM; ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -3029,7 +3033,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, return -ENOMEM; ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; @@ -3642,7 +3646,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (clone_root2 == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -4004,7 +4008,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid); + &left_mode, &left_uid, &left_gid, NULL); if (ret < 0) goto out; @@ -4015,7 +4019,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } else { ret = get_inode_info(sctx->parent_root, sctx->cur_ino, NULL, NULL, &right_mode, &right_uid, - &right_gid); + &right_gid, NULL); if (ret < 0) goto out; -- cgit v1.2.3 From 1f4692da951af4179a6522c6b48a09a43d37e614 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 10:42:24 +0200 Subject: Btrfs: fix cur_ino < parent_ino case for send/receive When the current inodes inum is smaller then the inum of the parent directory strange things were happending due to wrong path resolution and other bugs. Fix this with a new approach for the problem. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 390 +++++++++++++++++++++----------------------------------- 1 file changed, 146 insertions(+), 244 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5721401548e..a5fae484d4e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -107,7 +107,6 @@ struct send_ctx { int cur_inode_new; int cur_inode_new_gen; int cur_inode_deleted; - int cur_inode_first_ref_orphan; u64 cur_inode_size; u64 cur_inode_mode; @@ -2296,25 +2295,25 @@ out: * a valid path yet because we did not process the refs yet. So, the inode * is created as orphan. */ -static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path, - struct btrfs_key *key) +static int send_create_inode(struct send_ctx *sctx, u64 ino) { int ret = 0; - struct extent_buffer *eb = path->nodes[0]; - struct btrfs_inode_item *ii; struct fs_path *p; - int slot = path->slots[0]; int cmd; + u64 gen; u64 mode; + u64 rdev; -verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); +verbose_printk("btrfs: send_create_inode %llu\n", ino); p = fs_path_alloc(sctx); if (!p) return -ENOMEM; - ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); - mode = btrfs_inode_mode(eb, ii); + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, + NULL, &rdev); + if (ret < 0) + goto out; if (S_ISREG(mode)) cmd = BTRFS_SEND_C_MKFILE; @@ -2339,22 +2338,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino); if (ret < 0) goto out; - ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = gen_unique_name(sctx, ino, gen, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino); + TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p); + ret = read_symlink(sctx, sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { - TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev); } ret = send_cmd(sctx); @@ -2368,6 +2367,92 @@ out: return ret; } +/* + * We need some special handling for inodes that get processed before the parent + * directory got created. See process_recorded_refs for details. + * This function does the check if we already created the dir out of order. + */ +static int did_create_dir(struct send_ctx *sctx, u64 dir) +{ + int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key di_key; + struct extent_buffer *eb; + struct btrfs_dir_item *di; + int slot; + + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = dir; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + while (1) { + ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, + 1, 0); + if (ret < 0) + goto out; + if (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + } + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + + if (di_key.objectid < sctx->send_progress) { + ret = 1; + goto out; + } + + key.offset = found_key.offset + 1; + btrfs_release_path(path); + } + +out: + btrfs_free_path(path); + return ret; +} + +/* + * Only creates the inode if it is: + * 1. Not a directory + * 2. Or a directory which was not created already due to out of order + * directories. See did_create_dir and process_recorded_refs for details. + */ +static int send_create_inode_if_needed(struct send_ctx *sctx) +{ + int ret; + + if (S_ISDIR(sctx->cur_inode_mode)) { + ret = did_create_dir(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + if (ret) { + ret = 0; + goto out; + } + } + + ret = send_create_inode(sctx, sctx->cur_ino); + if (ret < 0) + goto out; + +out: + return ret; +} + struct recorded_ref { struct list_head list; char *dir_path; @@ -2517,160 +2602,6 @@ out: return ret; } -struct finish_unordered_dir_ctx { - struct send_ctx *sctx; - struct fs_path *cur_path; - struct fs_path *dir_path; - u64 dir_ino; - int need_delete; - int delete_pass; -}; - -int __finish_unordered_dir(int num, struct btrfs_key *di_key, - const char *name, int name_len, - const char *data, int data_len, - u8 type, void *ctx) -{ - int ret = 0; - struct finish_unordered_dir_ctx *fctx = ctx; - struct send_ctx *sctx = fctx->sctx; - u64 di_gen; - u64 di_mode; - int is_orphan = 0; - - if (di_key->objectid >= fctx->dir_ino) - goto out; - - fs_path_reset(fctx->cur_path); - - ret = get_inode_info(sctx->send_root, di_key->objectid, - NULL, &di_gen, &di_mode, NULL, NULL, NULL); - if (ret < 0) - goto out; - - ret = is_first_ref(sctx, sctx->send_root, di_key->objectid, - fctx->dir_ino, name, name_len); - if (ret < 0) - goto out; - if (ret) { - is_orphan = 1; - ret = gen_unique_name(sctx, di_key->objectid, di_gen, - fctx->cur_path); - } else { - ret = get_cur_path(sctx, di_key->objectid, di_gen, - fctx->cur_path); - } - if (ret < 0) - goto out; - - ret = fs_path_add(fctx->dir_path, name, name_len); - if (ret < 0) - goto out; - - if (!fctx->delete_pass) { - if (S_ISDIR(di_mode)) { - ret = send_rename(sctx, fctx->cur_path, - fctx->dir_path); - } else { - ret = send_link(sctx, fctx->dir_path, - fctx->cur_path); - if (is_orphan) - fctx->need_delete = 1; - } - } else if (!S_ISDIR(di_mode)) { - ret = send_unlink(sctx, fctx->cur_path); - } else { - ret = 0; - } - - fs_path_remove(fctx->dir_path); - -out: - return ret; -} - -/* - * Go through all dir items and see if we find refs which could not be created - * in the past because the dir did not exist at that time. - */ -static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen) -{ - int ret = 0; - struct btrfs_path *path = NULL; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *eb; - struct finish_unordered_dir_ctx fctx; - int slot; - - path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - memset(&fctx, 0, sizeof(fctx)); - fctx.sctx = sctx; - fctx.cur_path = fs_path_alloc(sctx); - fctx.dir_path = fs_path_alloc(sctx); - if (!fctx.cur_path || !fctx.dir_path) { - ret = -ENOMEM; - goto out; - } - fctx.dir_ino = dir; - - ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path); - if (ret < 0) - goto out; - - /* - * We do two passes. The first links in the new refs and the second - * deletes orphans if required. Deletion of orphans is not required for - * directory inodes, as we always have only one ref and use rename - * instead of link for those. - */ - -again: - key.objectid = dir; - key.type = BTRFS_DIR_ITEM_KEY; - key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); - - if (found_key.objectid != key.objectid || - found_key.type != key.type) { - btrfs_release_path(path); - break; - } - - ret = iterate_dir_item(sctx, sctx->send_root, path, - &found_key, __finish_unordered_dir, - &fctx); - if (ret < 0) - goto out; - - key.offset = found_key.offset + 1; - btrfs_release_path(path); - } - - if (!fctx.delete_pass && fctx.need_delete) { - fctx.delete_pass = 1; - goto again; - } - -out: - btrfs_free_path(path); - fs_path_free(sctx, fctx.cur_path); - fs_path_free(sctx, fctx.dir_path); - return ret; -} - /* * This does all the move/link/unlink/rmdir magic. */ @@ -2678,6 +2609,7 @@ static int process_recorded_refs(struct send_ctx *sctx) { int ret = 0; struct recorded_ref *cur; + struct recorded_ref *cur2; struct ulist *check_dirs = NULL; struct ulist_iterator uit; struct ulist_node *un; @@ -2734,6 +2666,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * the the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + /* * Check if this new ref would overwrite the first ref of * another unprocessed inode. If yes, orphanize the @@ -2768,7 +2740,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + if (is_orphan) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -2844,35 +2816,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (!ret) { - /* - * In case the inode was moved to a directory - * that was not created yet (see - * __record_new_ref), we can not unlink the ref - * as it will be needed later when the parent - * directory is created, so that we can move in - * the inode to the new dir. - */ - if (!is_orphan && - sctx->cur_inode_first_ref_orphan) { - ret = orphanize_inode(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - cur->full_path); - if (ret < 0) - goto out; - ret = gen_unique_name(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - valid_path); - if (ret < 0) - goto out; - is_orphan = 1; - - } else { - ret = send_unlink(sctx, cur->full_path); - if (ret < 0) - goto out; - } + ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; } ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, GFP_NOFS); @@ -2885,11 +2831,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * happen when a previous inode did overwrite the first ref * of this inode and no new refs were added for the current * inode. - * We can however not delete the orphan in case the inode relies - * in a directory that was not created yet (see - * __record_new_ref) */ - if (is_orphan && !sctx->cur_inode_first_ref_orphan) { + if (is_orphan) { ret = send_unlink(sctx, valid_path); if (ret < 0) goto out; @@ -2939,19 +2882,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ sctx->send_progress = sctx->cur_ino + 1; - /* - * We may have a directory here that has pending refs which could not - * be created before (because the dir did not exist before, see - * __record_new_ref). finish_outoforder_dir will link/move the pending - * refs. - */ - if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) { - ret = finish_outoforder_dir(sctx, sctx->cur_ino, - sctx->cur_inode_gen); - if (ret < 0) - goto out; - } - ret = 0; out: @@ -2979,31 +2909,6 @@ static int __record_new_ref(int num, u64 dir, int index, if (ret < 0) goto out; - /* - * The parent may be non-existent at this point in time. This happens - * if the ino of the parent dir is higher then the current ino. In this - * case, we can not process this ref until the parent dir is finally - * created. If we reach the parent dir later, process_recorded_refs - * will go through all dir items and process the refs that could not be - * processed before. In case this is the first ref, we set - * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to - * keep an orphan of the inode so that it later can be used for - * link/move - */ - ret = is_inode_existent(sctx, dir, gen); - if (ret < 0) - goto out; - if (!ret) { - ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir, - name->start, fs_path_len(name)); - if (ret < 0) - goto out; - if (ret) - sctx->cur_inode_first_ref_orphan = 1; - ret = 0; - goto out; - } - ret = get_cur_path(sctx, dir, gen, p); if (ret < 0) goto out; @@ -4078,7 +3983,6 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; - sctx->cur_inode_first_ref_orphan = 0; sctx->send_progress = sctx->cur_ino; if (result == BTRFS_COMPARE_TREE_NEW || @@ -4115,8 +4019,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - ret = send_create_inode(sctx, sctx->left_path, - sctx->cmp_key); + ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = 0; @@ -4146,8 +4049,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); - ret = send_create_inode(sctx, sctx->left_path, - sctx->cmp_key); + ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; -- cgit v1.2.3 From b9291affaa4576762c30fb31083f33f5d745dea1 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 11:07:18 +0200 Subject: Btrfs: add missing check for dir != tmp_dir to is_first_ref We missed that check which resultet in all refs with the same name being reported as first_ref. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a5fae484d4e..bea5b4378cc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1589,7 +1589,7 @@ static int is_first_ref(struct send_ctx *sctx, if (ret < 0) goto out; - if (name_len != fs_path_len(tmp_name)) { + if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { ret = 0; goto out; } -- cgit v1.2.3 From 9ea3ef516d828e435d283b7d5f0de05fd72a23ac Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 11:08:09 +0200 Subject: Btrfs: remove unused code with #if 0 fs_path_remove is not used at the moment due to a previous patch. Remove it for now (with #if 0) to avoid compile warnings. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bea5b4378cc..9e3f6d127d8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -327,6 +327,7 @@ out: return ret; } +#if 0 static void fs_path_remove(struct fs_path *p) { BUG_ON(p->reversed); @@ -334,6 +335,7 @@ static void fs_path_remove(struct fs_path *p) p->end--; *p->end = 0; } +#endif static int fs_path_copy(struct fs_path *p, struct fs_path *from) { -- cgit v1.2.3 From ccf1626b49a94d66f3bb58d634888049ac695b46 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 11:46:29 +0200 Subject: Btrfs: add correct parent to check_dirs when dir got moved We only added the parent for the new position of a moved dir. We also need to add the old parent of the moved dir. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9e3f6d127d8..328654ea440 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2805,6 +2805,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; } + } else if (S_ISDIR(sctx->cur_inode_mode) && + !list_empty(&sctx->deleted_refs)) { + /* + * We have a moved dir. Add the old parent to check_dirs + */ + cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, + list); + ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, + GFP_NOFS); + if (ret < 0) + goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { /* * We have a non dir inode. Go through all deleted refs and -- cgit v1.2.3 From d27aed5e24f8e7bb2b0d11e7a579f2bbdebafc2f Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 12:04:16 +0200 Subject: Btrfs: remove unused use_list from send/receive code use_list is a leftover and unused. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 328654ea440..f1d44b125be 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -125,7 +125,6 @@ struct send_ctx { struct name_cache_entry { struct list_head list; - struct list_head use_list; u64 ino; u64 gen; u64 parent_ino; @@ -1906,7 +1905,6 @@ out_cache: nce->name_len = fs_path_len(dest); nce->ret = ret; strcpy(nce->name, dest->start); - memset(&nce->use_list, 0, sizeof(nce->use_list)); if (ino < sctx->send_progress) nce->need_later_update = 0; -- cgit v1.2.3 From ee849c0472a9fe1dc09fe8390965d993b9c4e979 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 12:42:05 +0200 Subject: Btrfs: rename backref_ctx::found_in_send_root to found_itself The new name should be easier to understand/read. Commit is a result of Arne's review. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f1d44b125be..a25be0c1a6b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1029,7 +1029,7 @@ struct backref_ctx { u64 extent_len; /* Just to check for bugs in backref resolving */ - int found_in_send_root; + int found_itself; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1077,7 +1077,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (found->root == bctx->sctx->send_root && ino == bctx->cur_objectid && offset == bctx->cur_offset) { - bctx->found_in_send_root = 1; + bctx->found_itself = 1; } /* @@ -1210,7 +1210,7 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.found = 0; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; - backref_ctx.found_in_send_root = 0; + backref_ctx.found_itself = 0; backref_ctx.extent_len = num_bytes; /* @@ -1231,7 +1231,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; - if (!backref_ctx.found_in_send_root) { + if (!backref_ctx.found_itself) { /* found a bug in backref code? */ ret = -EIO; printk(KERN_ERR "btrfs: ERROR did not find backref in " -- cgit v1.2.3 From 35075bb046cc91f42a0e5336bdc07f3279061add Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 12:44:34 +0200 Subject: Btrfs: use kmalloc instead of stack for backref_ctx Make sure to never get in trouble due to the backref_ctx which was on the stack before. Commit is a result of Arne's review. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a25be0c1a6b..36711123bdc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1149,7 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 extent_item_pos; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx; + struct backref_ctx *backref_ctx = NULL; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1159,6 +1159,12 @@ static int find_extent_clone(struct send_ctx *sctx, if (!tmp_path) return -ENOMEM; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); + if (!backref_ctx) { + ret = -ENOMEM; + goto out; + } + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1206,12 +1212,12 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->found_refs = 0; } - backref_ctx.sctx = sctx; - backref_ctx.found = 0; - backref_ctx.cur_objectid = ino; - backref_ctx.cur_offset = data_offset; - backref_ctx.found_itself = 0; - backref_ctx.extent_len = num_bytes; + backref_ctx->sctx = sctx; + backref_ctx->found = 0; + backref_ctx->cur_objectid = ino; + backref_ctx->cur_offset = data_offset; + backref_ctx->found_itself = 0; + backref_ctx->extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1219,7 +1225,7 @@ static int find_extent_clone(struct send_ctx *sctx, * __iterate_backrefs work. */ if (data_offset + num_bytes >= ino_size) - backref_ctx.extent_len = ino_size - data_offset; + backref_ctx->extent_len = ino_size - data_offset; /* * Now collect all backrefs. @@ -1227,11 +1233,11 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, - __iterate_backrefs, &backref_ctx); + __iterate_backrefs, backref_ctx); if (ret < 0) goto out; - if (!backref_ctx.found_itself) { + if (!backref_ctx->found_itself) { /* found a bug in backref code? */ ret = -EIO; printk(KERN_ERR "btrfs: ERROR did not find backref in " @@ -1246,7 +1252,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " "num_bytes=%llu, logical=%llu\n", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx->found) verbose_printk("btrfs: no clones found\n"); cur_clone_root = NULL; @@ -1271,6 +1277,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " out: btrfs_free_path(tmp_path); + kfree(backref_ctx); return ret; } -- cgit v1.2.3 From 52f9e53ede8e1b261e68216c6c2f32bb3f26c795 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 16:32:45 +0200 Subject: Btrfs: use normal return path for root == send_root case Don't have a seperate return path for the mentioned case. Now we do the same "take lowest inode/offset" logic for all found clones. Commit is a result of Arne's review. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 36711123bdc..d2a4ee9125d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1109,12 +1109,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) return 0; if (offset + ctx->extent_len > ctx->cur_offset) return 0;*/ - - bctx->found++; - found->found_refs++; - found->ino = ino; - found->offset = offset; - return 0; } bctx->found++; -- cgit v1.2.3 From adbe7fb6c4750621a56867d9bb1980da3a4b8f33 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 12:51:32 +0200 Subject: Btrfs: don't break in the final loop of find_extent_clone If we break, we may miss the clone from send_root which we prefer over all other clones. Commit is a result of Arne's review. Reported-by: Arne Jansen Signed-off-by: Alexander Block --- fs/btrfs/send.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d2a4ee9125d..68b2543e5d6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1257,7 +1257,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " else if (sctx->clone_roots[i].root == sctx->send_root) /* prefer clones from send_root over others */ cur_clone_root = sctx->clone_roots + i; - break; } } -- cgit v1.2.3 From 17589bd96eeb7c2ef2d3baeb05005a24932cd1e9 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 14:13:35 +0200 Subject: Btrfs: fix memory leak for name_cache in send/receive When everything is done, name_cache_free is called which however forgot to call kfree on the cache entries. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 68b2543e5d6..9cee678c0fb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1822,6 +1822,7 @@ static void name_cache_free(struct send_ctx *sctx) list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { name_cache_delete(sctx, nce); + kfree(nce); } } -- cgit v1.2.3 From 7e0926fe5f20b5c88e51cba68512302b10f73d2a Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 14:20:58 +0200 Subject: Btrfs: fix use of radix_tree for name_cache in send/receive We can't easily use the index of the radix tree for inums as the radix tree uses 32bit indexes on 32bit kernels. For 32bit kernels, we now use the lower 32bit of the inum as index and an additional list to store multiple entries per radix tree entry. Reported-by: Arne Jansen Signed-off-by: Alexander Block --- fs/btrfs/send.c | 76 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 39 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9cee678c0fb..02e901adc3e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -125,6 +125,15 @@ struct send_ctx { struct name_cache_entry { struct list_head list; + /* + * radix_tree has only 32bit entries but we need to handle 64bit inums. + * We use the lower 32bit of the 64bit inum to store it in the tree. If + * more then one inum would fall into the same entry, we use radix_list + * to store the additional entries. radix_list is also used to store + * entries where two entries have the same inum but different + * generations. + */ + struct list_head radix_list; u64 ino; u64 gen; u64 parent_ino; @@ -1726,27 +1735,21 @@ static int name_cache_insert(struct send_ctx *sctx, struct name_cache_entry *nce) { int ret = 0; - struct name_cache_entry **ncea; - - ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); - if (ncea) { - if (!ncea[0]) - ncea[0] = nce; - else if (!ncea[1]) - ncea[1] = nce; - else - BUG(); - } else { - ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS); - if (!ncea) + struct list_head *nce_head; + + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + if (!nce_head) { + nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); + if (!nce_head) return -ENOMEM; + INIT_LIST_HEAD(nce_head); - ncea[0] = nce; - ncea[1] = NULL; - ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea); + ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); if (ret < 0) return ret; } + list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -1756,41 +1759,36 @@ static int name_cache_insert(struct send_ctx *sctx, static void name_cache_delete(struct send_ctx *sctx, struct name_cache_entry *nce) { - struct name_cache_entry **ncea; - - ncea = radix_tree_lookup(&sctx->name_cache, nce->ino); - BUG_ON(!ncea); - - if (ncea[0] == nce) - ncea[0] = NULL; - else if (ncea[1] == nce) - ncea[1] = NULL; - else - BUG(); + struct list_head *nce_head; - if (!ncea[0] && !ncea[1]) { - radix_tree_delete(&sctx->name_cache, nce->ino); - kfree(ncea); - } + nce_head = radix_tree_lookup(&sctx->name_cache, + (unsigned long)nce->ino); + BUG_ON(!nce_head); + list_del(&nce->radix_list); list_del(&nce->list); - sctx->name_cache_size--; + + if (list_empty(nce_head)) { + radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + kfree(nce_head); + } } static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, u64 ino, u64 gen) { - struct name_cache_entry **ncea; + struct list_head *nce_head; + struct name_cache_entry *cur; - ncea = radix_tree_lookup(&sctx->name_cache, ino); - if (!ncea) + nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + if (!nce_head) return NULL; - if (ncea[0] && ncea[0]->gen == gen) - return ncea[0]; - else if (ncea[1] && ncea[1]->gen == gen) - return ncea[1]; + list_for_each_entry(cur, nce_head, radix_list) { + if (cur->ino == ino && cur->gen == gen) + return cur; + } return NULL; } -- cgit v1.2.3 From 34d73f54e2e2227cece751f168d08d3103092992 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 16:18:58 +0200 Subject: Btrfs: make aux field of ulist 64 bit Btrfs send/receive uses the aux field to store inode numbers. On 32 bit machines this may become a problem. Also fix all users of ulist_add and ulist_add_merged. Reported-by: Arne Jansen Signed-off-by: Alexander Block --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/qgroup.c | 20 ++++++++++---------- fs/btrfs/ulist.c | 7 +++---- fs/btrfs/ulist.h | 9 ++++----- 4 files changed, 21 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ff6475f409d..6655ca61536 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { ret = ulist_add(parents, eb->start, - (unsigned long)eie, GFP_NOFS); + (u64)eie, GFP_NOFS); if (ret < 0) break; if (!extent_item_pos) { @@ -914,8 +914,8 @@ again: free_extent_buffer(eb); } ret = ulist_add_merge(refs, ref->parent, - (unsigned long)ref->inode_list, - (unsigned long *)&eie, GFP_NOFS); + (u64)ref->inode_list, + (u64 *)&eie, GFP_NOFS); if (!ret && extent_item_pos) { /* * we've recorded that parent, so we must extend @@ -1404,7 +1404,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " - "%#lx\n", root_node->val, ref_node->val, + "%#llx\n", root_node->val, ref_node->val, ref_node->aux); ret = iterate_leaf_refs( (struct extent_inode_elem *)ref_node->aux, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b6501558174..24f6adcdf33 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1145,7 +1145,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ulist_reinit(tmp); /* XXX id not needed */ - ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; @@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, + (u64)glist->group, GFP_ATOMIC); } } @@ -1168,7 +1168,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, * step 2: walk from the new root */ ulist_reinit(tmp); - ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { struct btrfs_qgroup *qg; @@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (u64)glist->group, GFP_ATOMIC); } } @@ -1208,7 +1208,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, continue; ulist_reinit(tmp); - ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; @@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (unsigned long)glist->group, + (u64)glist->group, GFP_ATOMIC); } } @@ -1469,7 +1469,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * be exceeded */ ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; @@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (u64)glist->group, GFP_ATOMIC); } } if (ret) @@ -1541,7 +1541,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) goto out; ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; @@ -1553,7 +1553,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (unsigned long)glist->group, GFP_ATOMIC); + (u64)glist->group, GFP_ATOMIC); } } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ab942f46b3d..99be4c138db 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free); * In case of allocation failure -ENOMEM is returned and the ulist stays * unaltered. */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - gfp_t gfp_mask) +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) { return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); } -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long *old_aux, gfp_t gfp_mask) +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask) { int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 21bdc8ec813..21a1963439c 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -33,7 +33,7 @@ struct ulist_iterator { */ struct ulist_node { u64 val; /* value to store */ - unsigned long aux; /* auxiliary value saved along with the val */ + u64 aux; /* auxiliary value saved along with the val */ }; struct ulist { @@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - gfp_t gfp_mask); -int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long *old_aux, gfp_t gfp_mask); +int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, + u64 *old_aux, gfp_t gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -- cgit v1.2.3 From e479d9bb5f8366a064915b3c9ac47ed6e9fcc7d3 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 16:09:35 +0200 Subject: Btrfs: update send_progress at correct places Updating send_progress in process_recorded_refs was not correct. It got updated too early in the cur_inode_new_gen case. Reported-by: Alex Lyakas Reported-by: Arne Jansen Signed-off-by: Alexander Block --- fs/btrfs/send.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 02e901adc3e..f618224d232 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2886,12 +2886,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } - /* - * Current inode is now at it's new position, so we must increase - * send_progress - */ - sctx->send_progress = sctx->cur_ino + 1; - ret = 0; out: @@ -3896,6 +3890,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = process_recorded_refs(sctx); + if (ret < 0) + goto out; + + /* + * We have processed the refs and thus need to advance send_progress. + * Now, calls to get_cur_xxx will take the updated refs of the current + * inode into account. + */ + sctx->send_progress = sctx->cur_ino + 1; out: return ret; @@ -3993,6 +3996,12 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; + + /* + * Set send_progress to current inode. This will tell all get_cur_xxx + * functions that the current inode's refs are not updated yet. Later, + * when process_recorded_refs is finished, it is set to cur_ino + 1. + */ sctx->send_progress = sctx->cur_ino; if (result == BTRFS_COMPARE_TREE_NEW || @@ -4066,6 +4075,11 @@ static int changed_inode(struct send_ctx *sctx, ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); if (ret < 0) goto out; + /* + * Advance send_progress now as we did not get into + * process_recorded_refs_if_needed in the new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; ret = process_all_extents(sctx); if (ret < 0) goto out; -- cgit v1.2.3 From 766702ef49b8b5299d819f3a0ac42613c23424d1 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 14:11:31 +0200 Subject: Btrfs: add/fix comments/documentation for send/receive As the subject already said, add/fix comments. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 134 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f618224d232..6926546939f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1067,6 +1067,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. + * Results are collected in sctx->clone_roots->ino/offset/found_refs */ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) { @@ -1090,7 +1091,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } /* - * There are inodes that have extents that lie behind it's i_size. Don't + * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, @@ -1137,6 +1138,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } /* + * Given an inode, offset and extent item, it finds a good clone for a clone + * instruction. Returns -ENOENT when none could be found. The function makes + * sure that the returned clone is usable at the point where sending is at the + * moment. This means, that no clones are accepted which lie behind the current + * inode+offset. + * * path must point to the extent item when called. */ static int find_extent_clone(struct send_ctx *sctx, @@ -1529,6 +1536,10 @@ out: return ret; } +/* + * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, + * generation of the parent dir and the name of the dir entry. + */ static int get_first_ref(struct send_ctx *sctx, struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) @@ -1615,6 +1626,16 @@ out: return ret; } +/* + * Used by process_recorded_refs to determine if a new ref would overwrite an + * already existing ref. In case it detects an overwrite, it returns the + * inode/gen in who_ino/who_gen. + * When an overwrite is detected, process_recorded_refs does proper orphanizing + * to make sure later references to the overwritten inode are possible. + * Orphanizing is however only required for the first ref of an inode. + * process_recorded_refs does an additional is_first_ref check to see if + * orphanizing is really required. + */ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, const char *name, int name_len, u64 *who_ino, u64 *who_gen) @@ -1639,6 +1660,11 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, goto out; } + /* + * Check if the overwritten ref was already processed. If yes, the ref + * was already unlinked/moved, so we can safely assume that we will not + * overwrite anything at this point in time. + */ if (other_inode > sctx->send_progress) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, who_gen, NULL, NULL, NULL, NULL); @@ -1655,6 +1681,13 @@ out: return ret; } +/* + * Checks if the ref was overwritten by an already processed inode. This is + * used by __get_cur_name_and_parent to find out if the ref was orphanized and + * thus the orphan name needs be used. + * process_recorded_refs also uses it to avoid unlinking of refs that were + * overwritten. + */ static int did_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 ino, u64 ino_gen, @@ -1703,6 +1736,11 @@ out: return ret; } +/* + * Same as did_overwrite_ref, but also checks if it is the first ref of an inode + * that got overwritten. This is used by process_recorded_refs to determine + * if it has to use the path as returned by get_cur_path or the orphan name. + */ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) { int ret = 0; @@ -1731,6 +1769,11 @@ out: return ret; } +/* + * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * so we need to do some special handling in case we have clashes. This function + * takes care of this with the help of name_cache_entry::radix_list. + */ static int name_cache_insert(struct send_ctx *sctx, struct name_cache_entry *nce) { @@ -1792,12 +1835,19 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, return NULL; } +/* + * Removes the entry from the list and adds it back to the end. This marks the + * entry as recently used so that name_cache_clean_unused does not remove it. + */ static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) { list_del(&nce->list); list_add_tail(&nce->list, &sctx->name_cache_list); } +/* + * Remove some entries from the beginning of name_cache_list. + */ static void name_cache_clean_unused(struct send_ctx *sctx) { struct name_cache_entry *nce; @@ -1824,6 +1874,14 @@ static void name_cache_free(struct send_ctx *sctx) } } +/* + * Used by get_cur_path for each ref up to the root. + * Returns 0 if it succeeded. + * Returns 1 if the inode is not existent or got overwritten. In that case, the + * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 + * is returned, parent_ino/parent_gen are not guaranteed to be valid. + * Returns <0 in case of error. + */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, u64 *parent_ino, @@ -1835,6 +1893,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; + /* + * First check if we already did a call to this function with the same + * ino/gen. If yes, check if the cache entry is still up-to-date. If yes + * return the cached result. + */ nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) { @@ -1857,6 +1920,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, if (!path) return -ENOMEM; + /* + * If the inode is not existent yet, add the orphan name and return 1. + * This should only happen for the parent dir that we determine in + * __record_new_ref + */ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) goto out; @@ -1869,6 +1937,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } + /* + * Depending on whether the inode was already processed or not, use + * send_root or parent_root for ref lookup. + */ if (ino < sctx->send_progress) ret = get_first_ref(sctx, sctx->send_root, ino, parent_ino, parent_gen, dest); @@ -1878,6 +1950,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, if (ret < 0) goto out; + /* + * Check if the ref was overwritten by an inode's ref that was processed + * earlier. If yes, treat as orphan and return 1. + */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, dest->start, dest->end - dest->start); if (ret < 0) @@ -1891,6 +1967,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } out_cache: + /* + * Store the result of the lookup in the name cache. + */ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS); if (!nce) { ret = -ENOMEM; @@ -2278,7 +2357,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); btrfs_inode_mtime(ii)); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, btrfs_inode_ctime(ii)); - /* TODO otime? */ + /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); @@ -2520,7 +2599,7 @@ static void free_recorded_refs(struct send_ctx *sctx) } /* - * Renames/moves a file/dir to it's orphan name. Used when the first + * Renames/moves a file/dir to its orphan name. Used when the first * ref of an unprocessed inode gets overwritten and for all non empty * directories. */ @@ -2840,7 +2919,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref * of this inode and no new refs were added for the current - * inode. + * inode. Unlinking does not mean that the inode is deleted in + * all cases. There may still be links to this inode in other + * places. */ if (is_orphan) { ret = send_unlink(sctx, valid_path); @@ -2857,6 +2938,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ ULIST_ITER_INIT(&uit); while ((un = ulist_next(check_dirs, &uit))) { + /* + * In case we had refs into dirs that were not processed yet, + * we don't need to do the utime and rmdir logic for these dirs. + * The dir will be processed later. + */ if (un->val > sctx->cur_ino) continue; @@ -4048,7 +4134,17 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + /* + * We need to do some special handling in case the inode was + * reported as changed with a changed generation number. This + * means that the original inode was deleted and new inode + * reused the same inum. So we have to treat the old inode as + * deleted and the new one as new. + */ if (sctx->cur_inode_new_gen) { + /* + * First, process the inode as if it was deleted. + */ sctx->cur_inode_gen = right_gen; sctx->cur_inode_new = 0; sctx->cur_inode_deleted = 1; @@ -4061,6 +4157,9 @@ static int changed_inode(struct send_ctx *sctx, if (ret < 0) goto out; + /* + * Now process the inode as if it was new. + */ sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = 1; sctx->cur_inode_deleted = 0; @@ -4080,6 +4179,11 @@ static int changed_inode(struct send_ctx *sctx, * process_recorded_refs_if_needed in the new_gen case. */ sctx->send_progress = sctx->cur_ino + 1; + + /* + * Now process all extents and xattrs of the inode as if + * they were all new. + */ ret = process_all_extents(sctx); if (ret < 0) goto out; @@ -4102,6 +4206,16 @@ out: return ret; } +/* + * We have to process new refs before deleted refs, but compare_trees gives us + * the new and deleted refs mixed. To fix this, we record the new/deleted refs + * first and later process them in process_recorded_refs. + * For the cur_inode_new_gen case, we skip recording completely because + * changed_inode did already initiate processing of refs. The reason for this is + * that in this case, compare_tree actually compares the refs of 2 different + * inodes. To fix this, process_all_refs is used in changed_inode to handle all + * refs of the right tree as deleted and all refs of the left tree as new. + */ static int changed_ref(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4122,6 +4236,11 @@ static int changed_ref(struct send_ctx *sctx, return ret; } +/* + * Process new/deleted/changed xattrs. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of xattrs. The reason is the same as in changed_ref + */ static int changed_xattr(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4141,6 +4260,11 @@ static int changed_xattr(struct send_ctx *sctx, return ret; } +/* + * Process new/deleted/changed extents. We skip processing in the + * cur_inode_new_gen case because changed_inode did already initiate processing + * of extents. The reason is the same as in changed_ref + */ static int changed_extent(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -4157,7 +4281,10 @@ static int changed_extent(struct send_ctx *sctx, return ret; } - +/* + * Updates compare related fields in sctx and simply forwards to the actual + * changed_xxx functions. + */ static int changed_cb(struct btrfs_root *left_root, struct btrfs_root *right_root, struct btrfs_path *left_path, @@ -4229,7 +4356,8 @@ join_trans: } /* - * Make sure the tree has not changed + * Make sure the tree has not changed after re-joining. We detect this + * by comparing start_ctransid and ctransid. They should always match. */ spin_lock(&send_root->root_times_lock); ctransid = btrfs_root_ctransid(&send_root->root_item); -- cgit v1.2.3 From e938c8ad543cd5ffe344371747484303ad51dfba Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Sat, 28 Jul 2012 16:33:49 +0200 Subject: Btrfs: code cleanups for send/receive Doing some code cleanups as suggested by Arne. Changes do not change any logic. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 83 ++++++++++++++++++++++++--------------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6926546939f..3bc921aa5e2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1115,10 +1115,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; - /*if (ino > ctx->cur_objectid) +#if 0 + if (ino > bctx->cur_objectid) + return 0; + if (offset + bctx->extent_len > bctx->cur_offset) return 0; - if (offset + ctx->extent_len > ctx->cur_offset) - return 0;*/ +#endif } bctx->found++; @@ -1327,8 +1329,6 @@ static int read_symlink(struct send_ctx *sctx, len = btrfs_file_extent_inline_len(path->nodes[0], ei); ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); - if (ret < 0) - goto out; out: btrfs_free_path(path); @@ -1440,9 +1440,9 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) } if (!left_ret && !right_ret) { - if (left_gen == gen && right_gen == gen) + if (left_gen == gen && right_gen == gen) { ret = inode_state_no_change; - else if (left_gen == gen) { + } else if (left_gen == gen) { if (ino < sctx->send_progress) ret = inode_state_did_create; else @@ -1615,11 +1615,7 @@ static int is_first_ref(struct send_ctx *sctx, goto out; } - ret = memcmp(tmp_name->start, name, name_len); - if (ret) - ret = 0; - else - ret = 1; + ret = !memcmp(tmp_name->start, name, name_len); out: fs_path_free(sctx, tmp_name); @@ -1761,8 +1757,6 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, name->start, fs_path_len(name)); - if (ret < 0) - goto out; out: fs_path_free(sctx, name); @@ -1866,9 +1860,10 @@ static void name_cache_clean_unused(struct send_ctx *sctx) static void name_cache_free(struct send_ctx *sctx) { struct name_cache_entry *nce; - struct name_cache_entry *tmp; - list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) { + while (!list_empty(&sctx->name_cache_list)) { + nce = list_entry(sctx->name_cache_list.next, + struct name_cache_entry, list); name_cache_delete(sctx, nce); kfree(nce); } @@ -2188,9 +2183,6 @@ static int send_subvol_begin(struct send_ctx *sctx) read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); btrfs_release_path(path); - if (ret < 0) - goto out; - if (parent_root) { ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) @@ -2393,19 +2385,19 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (ret < 0) goto out; - if (S_ISREG(mode)) + if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; - else if (S_ISDIR(mode)) + } else if (S_ISDIR(mode)) { cmd = BTRFS_SEND_C_MKDIR; - else if (S_ISLNK(mode)) + } else if (S_ISLNK(mode)) { cmd = BTRFS_SEND_C_SYMLINK; - else if (S_ISCHR(mode) || S_ISBLK(mode)) + } else if (S_ISCHR(mode) || S_ISBLK(mode)) { cmd = BTRFS_SEND_C_MKNOD; - else if (S_ISFIFO(mode)) + } else if (S_ISFIFO(mode)) { cmd = BTRFS_SEND_C_MKFIFO; - else if (S_ISSOCK(mode)) + } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; - else { + } else { printk(KERN_WARNING "btrfs: unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; @@ -2583,13 +2575,13 @@ static int record_ref(struct list_head *head, u64 dir, static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) { struct recorded_ref *cur; - struct recorded_ref *tmp; - list_for_each_entry_safe(cur, tmp, head, list) { + while (!list_empty(head)) { + cur = list_entry(head->next, struct recorded_ref, list); fs_path_free(sctx, cur->full_path); + list_del(&cur->list); kfree(cur); } - INIT_LIST_HEAD(head); } static void free_recorded_refs(struct send_ctx *sctx) @@ -3205,24 +3197,18 @@ static int process_all_refs(struct send_ctx *sctx, key.offset = 0; while (1) { ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) { - btrfs_release_path(path); + if (ret < 0) goto out; - } - if (ret) { - btrfs_release_path(path); + if (ret) break; - } eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) { - btrfs_release_path(path); + found_key.type != key.type) break; - } ret = iterate_inode_ref(sctx, sctx->parent_root, path, &found_key, 0, cb, sctx); @@ -3232,6 +3218,7 @@ static int process_all_refs(struct send_ctx *sctx, key.offset = found_key.offset + 1; } + btrfs_release_path(path); ret = process_recorded_refs(sctx); @@ -3554,7 +3541,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; loff_t pos = offset; - int readed = 0; + int num_read = 0; mm_segment_t old_fs; p = fs_path_alloc(sctx); @@ -3579,8 +3566,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); if (ret < 0) goto out; - readed = ret; - if (!readed) + num_read = ret; + if (!num_read) goto out; ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); @@ -3593,7 +3580,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); ret = send_cmd(sctx); @@ -3603,7 +3590,7 @@ out: set_fs(old_fs); if (ret < 0) return ret; - return readed; + return num_read; } /* @@ -3614,7 +3601,6 @@ static int send_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_root *clone_root2 = clone_root->root; struct fs_path *p; u64 gen; @@ -3639,22 +3625,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - if (clone_root2 == sctx->send_root) { + if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root2, clone_root->ino, p); + ret = get_inode_path(sctx, clone_root->root, + clone_root->ino, p); } if (ret < 0) goto out; TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root2->root_item.uuid); + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - clone_root2->root_item.ctransid); + clone_root->root->root_item.ctransid); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); -- cgit v1.2.3 From 3e126f32f88095ad1bd01b3c451e52aa9094f45c Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 12:03:09 +0200 Subject: Btrfs: remove unused tmp_path from iterate_dir_item A leftover from older code and unused now. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3bc921aa5e2..b0f9df30f24 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -864,7 +864,6 @@ static int iterate_dir_item(struct send_ctx *sctx, struct extent_buffer *eb; struct btrfs_item *item; struct btrfs_dir_item *di; - struct btrfs_path *tmp_path = NULL; struct btrfs_key di_key; char *buf = NULL; char *buf2 = NULL; @@ -886,12 +885,6 @@ static int iterate_dir_item(struct send_ctx *sctx, goto out; } - tmp_path = alloc_path_for_send(); - if (!tmp_path) { - ret = -ENOMEM; - goto out; - } - eb = path->nodes[0]; slot = path->slots[0]; item = btrfs_item_nr(eb, slot); @@ -953,7 +946,6 @@ static int iterate_dir_item(struct send_ctx *sctx, } out: - btrfs_free_path(tmp_path); if (buf_virtual) vfree(buf); else -- cgit v1.2.3 From 5dc67d0ba915cda01ed3a980502945edf2c46b70 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 12:07:43 +0200 Subject: Btrfs: free nce and nce_head on error in name_cache_insert Both were leaked in case of error. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b0f9df30f24..cfbe987f854 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1759,6 +1759,7 @@ out: * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, * so we need to do some special handling in case we have clashes. This function * takes care of this with the help of name_cache_entry::radix_list. + * In case of error, nce is kfreed. */ static int name_cache_insert(struct send_ctx *sctx, struct name_cache_entry *nce) @@ -1775,8 +1776,11 @@ static int name_cache_insert(struct send_ctx *sctx, INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); - if (ret < 0) + if (ret < 0) { + kfree(nce_head); + kfree(nce); return ret; + } } list_add_tail(&nce->radix_list, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); -- cgit v1.2.3 From 3954096d4b1adab6fbb3c26de37ddd4f781e072f Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 12:46:05 +0200 Subject: Btrfs: fix check for changed extent in is_extent_unchanged The previous check was working fine, but this check should be easier to read. Also, we could theoritically have some exotic bugs with the previous checks. Signed-off-by: Alexander Block --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cfbe987f854..35222d5420f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3819,8 +3819,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, /* * Check if we have the same extent. */ - if (left_disknr + left_offset_fixed != - right_disknr + right_offset) { + if (left_disknr != right_disknr || + left_offset_fixed != right_offset) { ret = 0; goto out; } -- cgit v1.2.3 From d8347fa444c682dc8a1e24e7158bfa2dd4c9ca71 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 12:49:15 +0200 Subject: Btrfs: use <= instead of < in is_extent_unchanged Used the wrong compare operator here. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 35222d5420f..c6c10a43153 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3802,7 +3802,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Are we at extent 8? If yes, we know the extent is changed. * This may only happen on the first iteration. */ - if (found_key.offset + right_len < ekey->offset) { + if (found_key.offset + right_len <= ekey->offset) { ret = 0; goto out; } -- cgit v1.2.3 From 2f28f4787c0fc37c508cfb6b7b11c00ce5072940 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 14:42:14 +0200 Subject: Btrfs: pass root instead of parent_root to iterate_inode_ref We need to pass the root that we determined earlier to iterate_inode_ref. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c6c10a43153..db051d17b0b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3206,8 +3206,8 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != key.type) break; - ret = iterate_inode_ref(sctx, sctx->parent_root, path, - &found_key, 0, cb, sctx); + ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, + sctx); btrfs_release_path(path); if (ret < 0) goto out; -- cgit v1.2.3 From 2981e225f7048b36470383bf5622c42139bd96f7 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 14:47:03 +0200 Subject: Btrfs: ignore non-FS inodes for send/receive We have to ignore inode/space cache objects in send/receive. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index db051d17b0b..a4011a9148f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4287,6 +4287,11 @@ static int changed_cb(struct btrfs_root *left_root, if (ret < 0) goto out; + /* Ignore non-FS objects */ + if (key->objectid == BTRFS_FREE_INO_OBJECTID || + key->objectid == BTRFS_FREE_SPACE_OBJECTID) + goto out; + if (key->type == BTRFS_INODE_ITEM_KEY) ret = changed_inode(sctx, result); else if (key->type == BTRFS_INODE_REF_KEY) -- cgit v1.2.3 From 6d85ed05e16e7ff747025c8374f23d7d81c98540 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Wed, 1 Aug 2012 14:48:59 +0200 Subject: Btrfs: don't treat top/root directory inode as deleted/reused We can't do the deleted/reused logic for top/root inodes as it would create a stream that tries to delete and recreate the root dir. Reported-by: Alex Lyakas Signed-off-by: Alexander Block --- fs/btrfs/send.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a4011a9148f..d17d75ebc48 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2627,6 +2627,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) struct btrfs_key loc; struct btrfs_dir_item *di; + /* + * Don't try to rmdir the top/root subvolume dir. + */ + if (dir == BTRFS_FIRST_FREE_OBJECTID) + return 0; + path = alloc_path_for_send(); if (!path) return -ENOMEM; @@ -2687,6 +2693,12 @@ static int process_recorded_refs(struct send_ctx *sctx) verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); + /* + * This should never happen as the root dir always has the same ref + * which is always '..' + */ + BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + valid_path = fs_path_alloc(sctx); if (!valid_path) { ret = -ENOMEM; @@ -4094,7 +4106,14 @@ static int changed_inode(struct send_ctx *sctx, right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], right_ii); - if (left_gen != right_gen) + + /* + * The cur_ino = root dir case is special here. We can't treat + * the inode as deleted+reused because it would generate a + * stream that tries to delete/mkdir the root dir. + */ + if (left_gen != right_gen && + sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) sctx->cur_inode_new_gen = 1; } -- cgit v1.2.3 From 74dd17fbe3d65829e75d84f00a9525b2ace93998 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 7 Aug 2012 16:25:13 -0400 Subject: Btrfs: fix btrfs send for inline items and compression The btrfs send code was assuming the offset of the file item into the extent translated to bytes on disk. If we're compressed, this isn't true, and so it was off into extents owned by other files. It was also improperly handling inline extents. This solves a crash where we may have gone past the end of the file extent item by not testing early enough for an inline extent. It also solves problems where we have a whole between the end of the inline item and the start of the full extent. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 5 ++++- fs/btrfs/extent_io.c | 1 - fs/btrfs/send.c | 46 +++++++++++++++++++++++++++++++++------------- 3 files changed, 37 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d183f60d63..fcc8c21a623 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5073,6 +5073,7 @@ static void tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + BUG_ON(*level == 0); path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], path->slots[*level]); path->slots[*level - 1] = 0; @@ -5089,7 +5090,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root, path->slots[*level]++; - while (path->slots[*level] == nritems) { + while (path->slots[*level] >= nritems) { if (*level == root_level) return -1; @@ -5433,9 +5434,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); if (ret) { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = changed_cb(left_root, right_root, left_path, right_path, &left_key, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c878476bb9..19319f5a91a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4332,7 +4332,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb, 0); - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d17d75ebc48..f22fdd41ff4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1149,6 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx, int ret; int extent_type; u64 logical; + u64 disk_byte; u64 num_bytes; u64 extent_item_pos; struct btrfs_file_extent_item *fi; @@ -1157,6 +1158,7 @@ static int find_extent_clone(struct send_ctx *sctx, struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; + int compressed; u32 i; tmp_path = alloc_path_for_send(); @@ -1186,17 +1188,18 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; goto out; } + compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = btrfs_file_extent_disk_bytenr(eb, fi); - if (logical == 0) { + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == 0) { ret = -ENOENT; goto out; } - logical += btrfs_file_extent_offset(eb, fi); + logical = disk_byte + btrfs_file_extent_offset(eb, fi); ret = extent_from_logical(sctx->send_root->fs_info, - logical, tmp_path, &found_key); + disk_byte, tmp_path, &found_key); btrfs_release_path(tmp_path); if (ret < 0) @@ -1234,10 +1237,16 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ + if (compressed == BTRFS_COMPRESS_NONE) + extent_item_pos = logical - found_key.objectid; + else + extent_item_pos = 0; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); + if (ret < 0) goto out; @@ -1246,8 +1255,8 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -EIO; printk(KERN_ERR "btrfs: ERROR did not find backref in " "send_root. inode=%llu, offset=%llu, " - "logical=%llu\n", - ino, data_offset, logical); + "disk_byte=%llu found extent=%llu\n", + ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -3678,10 +3687,17 @@ static int send_write_or_clone(struct send_ctx *sctx, ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) + if (type == BTRFS_FILE_EXTENT_INLINE) { len = btrfs_file_extent_inline_len(path->nodes[0], ei); - else + /* + * it is possible the inline item won't cover the whole page, + * but there may be items after this page. Make + * sure to send the whole thing + */ + len = PAGE_CACHE_ALIGN(len); + } else { len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + } if (offset + len > sctx->cur_inode_size) len = sctx->cur_inode_size - offset; @@ -3729,6 +3745,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, u64 left_offset_fixed; u64 left_len; u64 right_len; + u64 left_gen; + u64 right_gen; u8 left_type; u8 right_type; @@ -3738,17 +3756,17 @@ static int is_extent_unchanged(struct send_ctx *sctx, eb = left_path->nodes[0]; slot = left_path->slots[0]; - ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); left_type = btrfs_file_extent_type(eb, ei); - left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - left_len = btrfs_file_extent_num_bytes(eb, ei); - left_offset = btrfs_file_extent_offset(eb, ei); if (left_type != BTRFS_FILE_EXTENT_REG) { ret = 0; goto out; } + left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); + left_len = btrfs_file_extent_num_bytes(eb, ei); + left_offset = btrfs_file_extent_offset(eb, ei); + left_gen = btrfs_file_extent_generation(eb, ei); /* * Following comments will refer to these graphics. L is the left @@ -3804,6 +3822,7 @@ static int is_extent_unchanged(struct send_ctx *sctx, right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); right_len = btrfs_file_extent_num_bytes(eb, ei); right_offset = btrfs_file_extent_offset(eb, ei); + right_gen = btrfs_file_extent_generation(eb, ei); if (right_type != BTRFS_FILE_EXTENT_REG) { ret = 0; @@ -3832,7 +3851,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * Check if we have the same extent. */ if (left_disknr != right_disknr || - left_offset_fixed != right_offset) { + left_offset_fixed != right_offset || + left_gen != right_gen) { ret = 0; goto out; } -- cgit v1.2.3 From 995e01b7af745b8aaa5e882cfb7bfd5baab3f335 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Aug 2012 02:52:38 -0600 Subject: Btrfs: fix gcc warnings for 32bit compiles Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 25 +++++++++++++------------ fs/btrfs/file-item.c | 2 +- fs/btrfs/qgroup.c | 32 ++++++++++++++++---------------- fs/btrfs/send.c | 4 ++-- 4 files changed, 32 insertions(+), 31 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6655ca61536..e600857d3ca 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { ret = ulist_add(parents, eb->start, - (u64)eie, GFP_NOFS); + (uintptr_t)eie, GFP_NOFS); if (ret < 0) break; if (!extent_item_pos) { @@ -363,8 +363,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; - ref->inode_list = - node ? (struct extent_inode_elem *)node->aux : 0; + ref->inode_list = node ? + (struct extent_inode_elem *)(uintptr_t)node->aux : 0; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { @@ -375,8 +375,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; - new_ref->inode_list = - (struct extent_inode_elem *)node->aux; + new_ref->inode_list = (struct extent_inode_elem *) + (uintptr_t)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -914,7 +914,7 @@ again: free_extent_buffer(eb); } ret = ulist_add_merge(refs, ref->parent, - (u64)ref->inode_list, + (uintptr_t)ref->inode_list, (u64 *)&eie, GFP_NOFS); if (!ret && extent_item_pos) { /* @@ -959,7 +959,7 @@ static void free_leaf_list(struct ulist *blocks) while ((node = ulist_next(blocks, &uiter))) { if (!node->aux) continue; - eie = (struct extent_inode_elem *)node->aux; + eie = (struct extent_inode_elem *)(uintptr_t)node->aux; for (; eie; eie = eie_next) { eie_next = eie->next; kfree(eie); @@ -1405,11 +1405,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " "%#llx\n", root_node->val, ref_node->val, - ref_node->aux); - ret = iterate_leaf_refs( - (struct extent_inode_elem *)ref_node->aux, - root_node->val, extent_item_objectid, - iterate, ctx); + (long long)ref_node->aux); + ret = iterate_leaf_refs((struct extent_inode_elem *) + (uintptr_t)ref_node->aux, + root_node->val, + extent_item_objectid, + iterate, ctx); } ulist_free(roots); roots = NULL; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 857d93cd01d..54f03354bed 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,7 +25,7 @@ #include "transaction.h" #include "print-tree.h" -#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 24f6adcdf33..9b707ba5c6c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ulist_reinit(tmp); /* XXX id not needed */ - ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)tmp_unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; if (qg->refcnt < seq) qg->refcnt = seq + 1; else @@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (u64)glist->group, + (u64)(uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, * step 2: walk from the new root */ ulist_reinit(tmp); - ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); + ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if (qg->refcnt < seq) { /* not visited by step 1 */ qg->rfer += sgn * node->num_bytes; @@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (u64)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, continue; ulist_reinit(tmp); - ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC); + ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)tmp_unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; if (qg->tag == seq) continue; @@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(tmp, glist->group->qgroupid, - (u64)glist->group, + (uintptr_t)glist->group, GFP_ATOMIC); } } @@ -1469,13 +1469,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * be exceeded */ ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + qg->rfer + num_bytes > @@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (u64)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } if (ret) @@ -1502,7 +1502,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; qg->reserved += num_bytes; } @@ -1541,19 +1541,19 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) goto out; ulist = ulist_alloc(GFP_ATOMIC); - ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)unode->aux; + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, - (u64)glist->group, GFP_ATOMIC); + (uintptr_t)glist->group, GFP_ATOMIC); } } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f22fdd41ff4..e5c867996aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1035,7 +1035,7 @@ struct backref_ctx { static int __clone_root_cmp_bsearch(const void *key, const void *elt) { - u64 root = (u64)key; + u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; if (root < cr->root->objectid) @@ -1069,7 +1069,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) u64 i_size; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)root, bctx->sctx->clone_roots, + found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, bctx->sctx->clone_roots_cnt, sizeof(struct clone_root), __clone_root_cmp_bsearch); -- cgit v1.2.3 From 7c735313bd1277c2eb28421934d4c7a0fa7339f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 Aug 2012 15:43:26 -0400 Subject: Btrfs: update last trans if we don't update the inode There is a completely impossible situation to hit where you can preallocate a file, fsync it, write into the preallocated region, have the transaction commit twice and then fsync and then immediately lose power and lose all of the contents of the write. This patch fixes this just so I feel better about the situation and because it is lightweight, we just update the last_trans when we finish an ordered IO and we don't update the inode itself. This way we are completely safe and I feel better. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec154f95464..6971bac66d9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1949,6 +1949,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, root, ret); goto out_unlock; } + } else { + btrfs_set_inode_last_trans(trans, inode); } ret = 0; out_unlock: -- cgit v1.2.3 From 54338b5cc4fa3cfe260e8e4ade8b62a9079ea3f9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Aug 2012 16:20:52 -0400 Subject: Btrfs: do not allocate chunks as agressively Swinging this pendulum back the other way. We've been allocating chunks up to 2% of the disk no matter how much we actually have allocated. So instead fix this calculation to only allocate chunks if we have more than 80% of the space available allocated. Please test this as it will likely cause all sorts of ENOSPC problems to pop up suddenly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ba58024d40d..62fc92f2b9d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3504,7 +3504,8 @@ static int should_alloc_chunk(struct btrfs_root *root, * and purposes it's used space. Don't worry about locking the * global_rsv, it doesn't change except when the transaction commits. */ - num_allocated += global_rsv->size; + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += global_rsv->size; /* * in limited mode, we want to have some free space up to @@ -3518,15 +3519,8 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; - - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; return 1; } -- cgit v1.2.3 From 224ecce517af3a952321202cdf304c12e138caca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 16 Aug 2012 16:32:06 -0400 Subject: Btrfs: fix possible corruption when fsyncing written prealloced extents While working on my fsync patch my fsync tester kept hitting mismatching md5sums when I would randomly write to a prealloc'ed region, syncfs() and then write to the prealloced region some more and then fsync() and then immediately reboot. This is because the tree logging code will skip writing csums for file extents who's generation is less than the current running transaction. When we mark extents as written we haven't been updating their generation so they were always being skipped. This wouldn't happen if you were to preallocate and then write in the same transaction, but if you for example prealloced a VM you could definitely run into this problem. This patch makes my fsync tester happy again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4..b7c885c8423 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -935,12 +935,16 @@ again: btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); btrfs_mark_buffer_dirty(leaf); @@ -958,12 +962,16 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); path->slots[0]++; new_key.offset = start; btrfs_set_item_key_safe(trans, root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, @@ -991,12 +999,14 @@ again: leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); @@ -1056,12 +1066,14 @@ again: struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_mark_buffer_dirty(leaf); -- cgit v1.2.3 From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Aug 2012 13:14:17 -0400 Subject: Btrfs: turbo charge fsync At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worst yet you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/ctree.h | 12 ++- fs/btrfs/extent_map.c | 34 +++++++- fs/btrfs/extent_map.h | 5 +- fs/btrfs/file.c | 62 +++++++++----- fs/btrfs/inode.c | 120 ++++++++++++++++++++++++--- fs/btrfs/ioctl.c | 4 +- fs/btrfs/tree-log.c | 220 +++++++++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 416 insertions(+), 42 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5b2ad6bc4fe..7c7bf818f3c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -38,6 +38,7 @@ #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 +#define BTRFS_INODE_NEEDS_FULL_SYNC 7 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d195b50766..4b81ea3fa1b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3315,9 +3315,17 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); +int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, + u64 start, u64 end, int skip_pinned, + int modified); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache); +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *hint_byte, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, u64 *hint_byte, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7c97b330145..1fe82cfc1d9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -35,6 +35,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree) { tree->map = RB_ROOT; + INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void) em->in_tree = 0; em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = 0; atomic_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); return em; } @@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); + WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } } @@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; + if (merge->generation > em->generation) { + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + + list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); free_extent_map(merge); } @@ -211,11 +221,29 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; + if (merge->generation > em->generation) { + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + list_del_init(&merge->list); free_extent_map(merge); } } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +/** + * unpint_extent_cache - unpin an extent from the cache + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * @prealloc: if this is set we need to clear the prealloc flag + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) { int ret = 0; struct extent_map *em; @@ -228,10 +256,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) if (!em) goto out; + list_move(&em->list, &tree->modified_extents); + em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); try_merge_map(tree, em); - free_extent_map(em); out: write_unlock(&tree->lock); @@ -358,6 +387,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); + list_del_init(&em->list); em->in_tree = 0; return ret; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 1195f09761f..2388a60bd6e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -23,15 +23,18 @@ struct extent_map { u64 orig_start; u64 block_start; u64 block_len; + u64 generation; unsigned long flags; struct block_device *bdev; atomic_t refs; unsigned int in_tree; unsigned int compress_type; + struct list_head list; }; struct extent_map_tree { struct rb_root map; + struct list_head modified_extents; rwlock_t lock; }; @@ -60,7 +63,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b7c885c8423..399f9d71a92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -459,13 +459,14 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, * [start, end]. Existing extents are split as required. */ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; struct extent_map *split2 = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 len = end - start + 1; + u64 gen; int ret; int testend = 1; unsigned long flags; @@ -490,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, break; } flags = em->flags; + gen = em->generation; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { if (testend && em->start + em->len >= start + len) { free_extent_map(em); @@ -518,12 +520,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; - + split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = split2; split2 = NULL; @@ -537,6 +540,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; + split->generation = gen; if (compressed) { split->block_len = em->block_len; @@ -550,6 +554,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, ret = add_extent_mapping(em_tree, split); BUG_ON(ret); /* Logic error */ + list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = NULL; } @@ -576,13 +581,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache) +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *hint_byte, int drop_cache) { - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_path *path; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -597,14 +602,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int recow; int ret; int modify_tree = -1; + int update_refs = (root->ref_cows || root == root->fs_info->tree_root); if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (start >= BTRFS_I(inode)->disk_i_size) modify_tree = 0; @@ -707,7 +709,7 @@ next_slot: extent_end - start); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, @@ -734,7 +736,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { inode_sub_bytes(inode, end - key.offset); *hint_byte = disk_bytenr; } @@ -753,7 +755,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) { inode_sub_bytes(inode, extent_end - start); *hint_byte = disk_bytenr; } @@ -777,12 +779,13 @@ next_slot: del_nr++; } - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + if (update_refs && + extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, extent_end - key.offset); extent_end = ALIGN(extent_end, root->sectorsize); - } else if (disk_bytenr > 0) { + } else if (update_refs && disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, @@ -806,7 +809,7 @@ next_slot: del_nr); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto out; + break; } del_nr = 0; @@ -825,7 +828,22 @@ next_slot: btrfs_abort_transaction(trans, root, ret); } -out: + btrfs_release_path(path); + return ret; +} + +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, u64 *hint_byte, int drop_cache) +{ + struct btrfs_path *path; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, + hint_byte, drop_cache); btrfs_free_path(path); return ret; } @@ -892,8 +910,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int ret; u64 ino = btrfs_ino(inode); - btrfs_drop_extent_cache(inode, start, end - 1, 0); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1556,6 +1572,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; + + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever + * reason, it's no longer relevant. + */ + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); mutex_unlock(&inode->i_mutex); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6971bac66d9..1b99fe8a129 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, inode, start, aligned_end, + ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, &hint_byte, 1); if (ret) return ret; @@ -1803,7 +1803,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, &hint, 0); if (ret) goto out; @@ -1929,11 +1930,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, - ordered_extent->len); } - + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, ordered_extent->len, + trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); goto out_unlock; @@ -2592,6 +2592,18 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + + /* + * If we were modified in the current generation and evicted from memory + * and then re-read we need to do a full sync since we don't have any + * idea about which extents were modified before we were evicted from + * cache. + */ + if (BTRFS_I(inode)->last_trans == root->fs_info->generation) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; @@ -3269,8 +3281,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = -1; + /* + * We want to drop from the next block forward in case this new size is + * not block aligned since we will be keeping the last block of the + * extent just the way it is. + */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -3579,6 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 mask = root->sectorsize - 1; u64 hole_start = (oldsize + mask) & ~mask; u64 block_end = (size + mask) & ~mask; @@ -3615,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + struct extent_map *hole_em; u64 hint_byte = 0; hole_size = last_byte - cur_offset; @@ -3624,7 +3643,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } - err = btrfs_drop_extents(trans, inode, cur_offset, + err = btrfs_drop_extents(trans, root, inode, + cur_offset, cur_offset + hole_size, &hint_byte, 1); if (err) { @@ -3643,9 +3663,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } - btrfs_drop_extent_cache(inode, hole_start, - last_byte - 1, 0); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + hole_size - 1, 0); + hole_em = alloc_extent_map(); + if (!hole_em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + hole_em->start = cur_offset; + hole_em->len = hole_size; + hole_em->orig_start = cur_offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + while (1) { + write_lock(&em_tree->lock); + err = add_extent_mapping(em_tree, hole_em); + if (!err) + list_move(&hole_em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + if (err != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + + hole_size - 1, 0); + } + free_extent_map(hole_em); +next: btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); } @@ -4673,6 +4723,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; + /* + * We could have gotten an inode number from somebody who was fsynced + * and then removed in this same transaction, so let's just set full + * sync since it will be a full sync anyway and this will blow away the + * old info in the log. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + if (S_ISDIR(mode)) owner = 0; else @@ -6839,6 +6897,15 @@ static int btrfs_truncate(struct inode *inode) &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + */ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + while (1) { ret = btrfs_block_rsv_refill(root, rsv, min_size); if (ret) { @@ -7510,6 +7577,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; @@ -7550,6 +7619,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + goto next; + } + + em->start = cur_offset; + em->orig_start = cur_offset; + em->len = ins.offset; + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->generation = trans->transid; + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset - 1, + 0); + } + free_extent_map(em); +next: num_bytes -= ins.offset; cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9df50fa8a07..95223222d5a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2576,7 +2576,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datal -= off - key.offset; } - ret = btrfs_drop_extents(trans, inode, + ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, &hint_byte, 1); @@ -2650,7 +2650,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, size -= skip + trim; datal -= skip + trim; - ret = btrfs_drop_extents(trans, inode, + ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, &hint_byte, 1); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c86670f4f28..f2ff02c5513 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "transaction.h" #include "disk-io.h" @@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, inode, start, extent_end, + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, &alloc_hint, 1); BUG_ON(ret); @@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } +static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct extent_map *em1, *em2; + + em1 = list_entry(a, struct extent_map, list); + em2 = list_entry(b, struct extent_map, list); + + if (em1->start < em2->start) + return -1; + else if (em1->start > em2->start) + return 1; + return 0; +} + +struct log_args { + struct extent_buffer *src; + u64 next_offset; + int start_slot; + int nr; +}; + +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + struct extent_map *em, struct btrfs_path *path, + struct btrfs_path *dst_path, struct log_args *args) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 start = em->start; + u64 len = em->len; + u64 num_bytes; + int nritems; + int ret; + + if (BTRFS_I(inode)->logged_trans == trans->transid) { + u64 tmp; + ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, + start + len, &tmp, 0); + if (ret) + return ret; + } + + while (len) { + if (args->nr) + goto next_slot; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (ret) { + /* + * This shouldn't happen, but it might so warn and + * return an error. + */ + WARN_ON(1); + return -ENOENT; + } + args->src = path->nodes[0]; +next_slot: + fi = btrfs_item_ptr(args->src, path->slots[0], + struct btrfs_file_extent_item); + if (args->nr && + args->start_slot + args->nr == path->slots[0]) { + args->nr++; + } else if (args->nr) { + ret = copy_items(trans, log, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 1; + args->start_slot = path->slots[0]; + } else if (!args->nr) { + args->nr = 1; + args->start_slot = path->slots[0]; + } + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + num_bytes = btrfs_file_extent_num_bytes(args->src, fi); + if (len < num_bytes) { + /* I _think_ this is ok, envision we write to a + * preallocated space that is adjacent to a previously + * written preallocated space that gets merged when we + * mark this preallocated space written. If we do not + * have the adjacent extent in cache then when we copy + * this extent it could end up being larger than our EM + * thinks it is, which is a-ok, so just set len to 0. + */ + len = 0; + } else { + len -= num_bytes; + } + start += btrfs_file_extent_num_bytes(args->src, fi); + args->next_offset = start; + + if (path->slots[0] < nritems) { + if (len) + goto next_slot; + break; + } + + if (args->nr) { + ret = copy_items(trans, log, dst_path, args->src, + args->start_slot, args->nr, + LOG_INODE_ALL); + if (ret) + return ret; + args->nr = 0; + btrfs_release_path(path); + } + } + + return 0; +} + +static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + struct log_args args; + struct btrfs_root *log = root->log_root; + struct extent_map *em, *n; + struct list_head extents; + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + u64 test_gen; + int ret = 0; + + INIT_LIST_HEAD(&extents); + + memset(&args, 0, sizeof(args)); + + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + list_del_init(&em->list); + if (em->generation <= test_gen) + continue; + list_add_tail(&em->list, &extents); + } + + list_sort(NULL, &extents, extent_cmp); + + while (!list_empty(&extents)) { + em = list_entry(extents.next, struct extent_map, list); + + list_del_init(&em->list); + + /* + * If we had an error we just need to delete everybody from our + * private list. + */ + if (ret) + continue; + + /* + * If the previous EM and the last extent we left off on aren't + * sequential then we need to copy the items we have and redo + * our search + */ + if (args.nr && em->start != args.next_offset) { + ret = copy_items(trans, log, dst_path, args.src, + args.start_slot, args.nr, + LOG_INODE_ALL); + if (ret) + continue; + btrfs_release_path(path); + args.nr = 0; + } + + ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + } + + if (!ret && args.nr) + ret = copy_items(trans, log, dst_path, args.src, + args.start_slot, args.nr, LOG_INODE_ALL); + btrfs_release_path(path); + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int nritems; int ins_start_slot = 0; int ins_nr; + bool fast_search = false; u64 ino = btrfs_ino(inode); log = root->log_root; @@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.objectid = ino; - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; + /* today the code can only do partial logging of directories */ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else @@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } else { + fast_search = true; + max_key.type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, ino, + BTRFS_XATTR_ITEM_KEY); + } } if (ret) { err = ret; @@ -2960,7 +3157,18 @@ next_slot: } ins_nr = 0; } - WARN_ON(ins_nr); + + if (fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, path, + dst_path); + if (ret) { + err = ret; + goto out_unlock; + } + } + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3 From 0fa83cdb1d72a94ea84ab6380747de6ac7cc8753 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Aug 2012 14:48:11 -0400 Subject: Btrfs: only warn if we hit an error when doing the tree logging I hit this a couple times while working on my fsync patch (all my bugs, not normal operation), but with my new stuff we could have new errors from cases I have not encountered, so instead of BUG()'ing we should be WARN()'ing so that we are notified there is a problem but the user doesn't lose their data. We can easily commit the transaction in the case that the tree logging fails and still be fine, so let's try and be as nice to the user as possible. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f2ff02c5513..94db438494d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3346,7 +3346,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - BUG_ON(ret != -ENOSPC); + WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } -- cgit v1.2.3 From 06d3d22b456c2f87aeb1eb4517eeabb47e21fcc9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:19 -0600 Subject: Btrfs: cleanup extents after we finish logging inode This is based on Josef's "Btrfs: turbo charge fsync". We should cleanup those extents after we've finished logging inode, otherwise we may do redundant work on them. Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 94db438494d..58075d711d2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3167,6 +3167,12 @@ next_slot: err = ret; goto out_unlock; } + } else { + struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em, *n; + + list_for_each_entry_safe(em, n, &tree->modified_extents, list) + list_del_init(&em->list); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From ca7e70f59078046db28501519308c2061b0e7a6f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 27 Aug 2012 17:48:15 -0400 Subject: Btrfs: do not needlessly restart the transaction for enospc We will stop and restart a transaction every time we move to a different leaf when truncating a file. This is for enospc reasons, but really we could probably get away with doing this a little better by actually working until we hit an ENOSPC. So add a ->failfast flag to the block_rsv and set it when we do truncates which will fail as soon as the block rsv runs out of space, and then at that point we can stop and restart the transaction and refill the block rsv and carry on. This will make rm'ing of a file with lots of extents a bit faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 53 +++++++++++++++++--------------------------------- 3 files changed, 20 insertions(+), 36 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4b81ea3fa1b..81c772b5dc8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1034,6 +1034,7 @@ struct btrfs_block_rsv { struct btrfs_space_info *space_info; spinlock_t lock; unsigned int full; + unsigned int failfast; }; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 62fc92f2b9d..df2f17b0ac5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6308,7 +6308,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret) { + if (ret && !block_rsv->failfast) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1b99fe8a129..ca4fa05171a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3448,12 +3448,6 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot) { - if (root->ref_cows && - BTRFS_I(inode)->location.objectid != - BTRFS_FREE_INO_OBJECTID) { - err = -EAGAIN; - goto out; - } if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -3826,6 +3820,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } rsv->size = min_size; + rsv->failfast = 1; global_rsv = &root->fs_info->global_block_rsv; btrfs_i_size_write(inode, 0); @@ -3870,7 +3865,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -EAGAIN) + if (ret != -ENOSPC) break; nr = trans->blocks_used; @@ -6852,6 +6847,7 @@ static int btrfs_truncate(struct inode *inode) if (!rsv) return -ENOMEM; rsv->size = min_size; + rsv->failfast = 1; /* * 1 for the truncate slack space @@ -6905,37 +6901,13 @@ static int btrfs_truncate(struct inode *inode) * safe. */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + trans->block_rsv = rsv; while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); - if (ret) { - /* - * This can only happen with the original transaction we - * started above, every other time we shouldn't have a - * transaction started yet. - */ - if (ret == -EAGAIN) - goto end_trans; - err = ret; - break; - } - - if (!trans) { - /* Just need the 1 for updating the inode */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = err = PTR_ERR(trans); - trans = NULL; - break; - } - } - - trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) { + if (ret != -ENOSPC) { err = ret; break; } @@ -6946,11 +6918,22 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } -end_trans: + nr = trans->blocks_used; btrfs_end_transaction(trans, root); - trans = NULL; btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = err = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; } if (ret == 0 && inode->i_nlink > 0) { -- cgit v1.2.3 From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:20 -0600 Subject: Btrfs: improve fsync by filtering extents that we want This is based on Josef's "Btrfs: turbo charge fsync". The above Josef's patch performs very good in random sync write test, because we won't have too much extents to merge. However, it does not performs good on the test: dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync The reason is when we do sequencial sync write, we need to merge the current extent just with the previous one, so that we can get accumulated extents to log: A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ... So we'll have to flush more and more checksum into log tree, which is the bottleneck according to my tests. But we can avoid this by telling fsync the real extents that are needed to be logged. With this, I did the above dd sync write test (size=50m), w/o (orig) w/ (josef's) w/ (this) SATA 104KB/s 109KB/s 121KB/s ramdisk 1.5MB/s 1.5MB/s 10.7MB/s (613%) Signed-off-by: Liu Bo --- fs/btrfs/extent_map.c | 20 ++++++++++++++++++++ fs/btrfs/extent_map.h | 2 ++ fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 6 +++--- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1fe82cfc1d9..ac606f076eb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_start = merge->block_start; merge->in_tree = 0; if (merge->generation > em->generation) { + em->mod_start = em->start; + em->mod_len = em->len; em->generation = merge->generation; list_move(&em->list, &tree->modified_extents); } @@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; if (merge->generation > em->generation) { + em->mod_len = em->len; em->generation = merge->generation; list_move(&em->list, &tree->modified_extents); } @@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, { int ret = 0; struct extent_map *em; + bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } try_merge_map(tree, em); + + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } + free_extent_map(em); out: write_unlock(&tree->lock); @@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree, } atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + try_merge_map(tree, em); out: return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 2388a60bd6e..8e6294b5135 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -20,6 +20,8 @@ struct extent_map { /* all of these are in bytes */ u64 start; u64 len; + u64 mod_start; + u64 mod_len; u64 orig_start; u64 block_start; u64 block_len; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca4fa05171a..878116d9625 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1308,6 +1308,7 @@ out_check: em->block_start = disk_bytenr; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58075d711d2..71e71539ffb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 start = em->start; - u64 len = em->len; + u64 start = em->mod_start; + u64 len = em->mod_len; u64 num_bytes; int nritems; int ret; @@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * sequential then we need to copy the items we have and redo * our search */ - if (args.nr && em->start != args.next_offset) { + if (args.nr && em->mod_start != args.next_offset) { ret = copy_items(trans, log, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); -- cgit v1.2.3 From 321f0e70225abc792d74902a2bc4a60164265fd4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 28 Aug 2012 22:13:02 -0600 Subject: Btrfs: fix wrong orphan count of the fs/file tree If we add a new orphan item, we should increase the atomic counter, not decrease it. Fix it. Signed-off-by: Miao Xie --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 878116d9625..a6824bd0449 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2228,7 +2228,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; - atomic_dec(&root->orphan_inodes); + atomic_inc(&root->orphan_inodes); } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, -- cgit v1.2.3 From 46d8bc34248f3a94dea910137d1ddf5fb1e3a1cc Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Aug 2012 01:07:55 -0600 Subject: Btrfs: fix a bug in checking whether a inode is already in log This is based on Josef's "Btrfs: turbo charge fsync". The current btrfs checks if an inode is in log by comparing root's last_log_commit to inode's last_sub_trans[2]. But the problem is that this root->last_log_commit is shared among inodes. Say we have N inodes to be logged, after the first inode, root's last_log_commit is updated and the N-1 remained files will be skipped. This fixes the bug by keeping a local copy of root's last_log_commit inside each inode and this local copy will be maintained itself. [1]: we regard each log transaction as a subset of btrfs's transaction, i.e. sub_trans Signed-off-by: Liu Bo --- fs/btrfs/btrfs_inode.h | 14 ++++++-------- fs/btrfs/inode.c | 2 ++ fs/btrfs/transaction.h | 1 + fs/btrfs/tree-log.c | 1 + 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7c7bf818f3c..ed8ca7ca5ef 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -144,6 +144,9 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; + /* a local copy of root's last_log_commit */ + unsigned long last_log_commit; + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent @@ -203,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); if (BTRFS_I(inode)->logged_trans == generation && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; + BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit) + return 1; + return 0; } #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a6824bd0449..24745b8f274 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6774,6 +6774,7 @@ again: BTRFS_I(inode)->last_trans = root->fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -7018,6 +7019,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; + ei->last_log_commit = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e8b8416c688..1a138bfb8f1 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -88,6 +88,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 71e71539ffb..fc0df95c286 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3185,6 +3185,7 @@ next_slot: } } BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); -- cgit v1.2.3 From d2794405113cd04f13a58e32e4d541dc87ec86e4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 29 Aug 2012 01:07:56 -0600 Subject: Btrfs: check if an inode has no checksum when logging it This is based on Josef's "Btrfs: turbo charge fsync". If an inode is a BTRFS_INODE_NODATASUM one, we don't need to look for csum items any more. Signed-off-by: Liu Bo --- fs/btrfs/tree-log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc0df95c286..5e2ffb95cc6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2680,13 +2680,14 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, } static noinline int copy_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct inode *inode, struct btrfs_path *dst_path, struct extent_buffer *src, int start_slot, int nr, int inode_only) { unsigned long src_offset; unsigned long dst_offset; + struct btrfs_root *log = BTRFS_I(inode)->root->log_root; struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; int ret; @@ -2695,6 +2696,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, char *ins_data; int i; struct list_head ordered_sums; + int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); @@ -2745,7 +2747,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, struct btrfs_file_extent_item); @@ -2873,7 +2876,7 @@ next_slot: args->start_slot + args->nr == path->slots[0]) { args->nr++; } else if (args->nr) { - ret = copy_items(trans, log, dst_path, args->src, + ret = copy_items(trans, inode, dst_path, args->src, args->start_slot, args->nr, LOG_INODE_ALL); if (ret) @@ -2910,7 +2913,7 @@ next_slot: } if (args->nr) { - ret = copy_items(trans, log, dst_path, args->src, + ret = copy_items(trans, inode, dst_path, args->src, args->start_slot, args->nr, LOG_INODE_ALL); if (ret) @@ -2930,7 +2933,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path) { struct log_args args; - struct btrfs_root *log = root->log_root; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -2971,7 +2973,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * our search */ if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, log, dst_path, args.src, + ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); if (ret) @@ -2984,7 +2986,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } if (!ret && args.nr) - ret = copy_items(trans, log, dst_path, args.src, + ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); WARN_ON(!list_empty(&extents)); @@ -3109,7 +3111,7 @@ again: goto next_slot; } - ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; @@ -3127,7 +3129,7 @@ next_slot: goto again; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { @@ -3148,8 +3150,7 @@ next_slot: break; } if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, + ret = copy_items(trans, inode, dst_path, src, ins_start_slot, ins_nr, inode_only); if (ret) { err = ret; -- cgit v1.2.3 From 2671485d395c07fca104c972785898d7c52fc942 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 12:24:27 -0400 Subject: Btrfs: remove unused hint byte argument for btrfs_drop_extents I audited all users of btrfs_drop_extents and found that nobody actually uses the hint_byte argument. I'm sure it was used for something at some point but it's not used now, and the way the pinning works the disk bytenr would never be immediately useful anyway so lets just remove it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/file.c | 16 +++++----------- fs/btrfs/inode.c | 12 +++--------- fs/btrfs/ioctl.c | 5 ++--- fs/btrfs/tree-log.c | 7 ++----- 5 files changed, 14 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 81c772b5dc8..71d6ff13d76 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3323,10 +3323,10 @@ extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *hint_byte, int drop_cache); + int drop_cache); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, u64 *hint_byte, int drop_cache); + u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 399f9d71a92..58598c24995 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -584,7 +584,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *hint_byte, int drop_cache) + int drop_cache) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -716,7 +716,6 @@ next_slot: new_key.objectid, start - extent_offset, 0); BUG_ON(ret); /* -ENOMEM */ - *hint_byte = disk_bytenr; } key.offset = start; } @@ -736,10 +735,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_mark_buffer_dirty(leaf); - if (update_refs && disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, end - key.offset); - *hint_byte = disk_bytenr; - } break; } @@ -755,10 +752,8 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_mark_buffer_dirty(leaf); - if (update_refs && disk_bytenr > 0) { + if (update_refs && disk_bytenr > 0) inode_sub_bytes(inode, extent_end - start); - *hint_byte = disk_bytenr; - } if (end == extent_end) break; @@ -794,7 +789,6 @@ next_slot: BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); - *hint_byte = disk_bytenr; } if (end == extent_end) @@ -834,7 +828,7 @@ next_slot: int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, u64 *hint_byte, int drop_cache) + u64 end, int drop_cache) { struct btrfs_path *path; int ret; @@ -843,7 +837,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, - hint_byte, drop_cache); + drop_cache); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 24745b8f274..f0a4792492e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 inline_len = actual_end - start; u64 aligned_end = (end + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - u64 hint_byte; u64 data_len = inline_len; int ret; @@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, - &hint_byte, 1); + ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); if (ret) return ret; @@ -1786,7 +1784,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; - u64 hint; int ret; path = btrfs_alloc_path(); @@ -1805,8 +1802,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * with the others. */ ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, - &hint, 0); + file_pos + num_bytes, 0); if (ret) goto out; @@ -3629,7 +3625,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - u64 hint_byte = 0; hole_size = last_byte - cur_offset; trans = btrfs_start_transaction(root, 3); @@ -3640,8 +3635,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, - cur_offset + hole_size, - &hint_byte, 1); + cur_offset + hole_size, 1); if (err) { btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 95223222d5a..5543fd562b5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2353,7 +2353,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; - u64 hint_byte; /* * TODO: @@ -2579,7 +2578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, - &hint_byte, 1); + 1); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2653,7 +2652,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = btrfs_drop_extents(trans, root, inode, new_key.offset, new_key.offset + datal, - &hint_byte, 1); + 1); if (ret) { btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5e2ffb95cc6..0c39f58973d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -485,7 +485,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 mask = root->sectorsize - 1; u64 extent_end; - u64 alloc_hint; u64 start = key->offset; u64 saved_nbytes; struct btrfs_file_extent_item *item; @@ -551,8 +550,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, start, extent_end, - &alloc_hint, 1); + ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -2843,9 +2841,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; if (BTRFS_I(inode)->logged_trans == trans->transid) { - u64 tmp; ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, &tmp, 0); + start + len, 0); if (ret) return ret; } -- cgit v1.2.3 From 2aaa66558172b017f36bf38ae69372813dedee9d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Aug 2012 14:27:18 -0400 Subject: Btrfs: add hole punching This patch adds hole punching via fallocate. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 +- fs/btrfs/extent-tree.c | 2 + fs/btrfs/file.c | 332 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/inode.c | 28 +++-- fs/btrfs/tree-log.c | 2 +- 5 files changed, 355 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 71d6ff13d76..88adfe63840 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3250,6 +3250,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, @@ -3323,7 +3325,7 @@ extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - int drop_cache); + u64 *drop_end, int drop_cache); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index df2f17b0ac5..ee51b7afe97 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4132,6 +4132,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { + if (!rsv) + return; btrfs_block_rsv_release(root, rsv, (u64)-1); kfree(rsv); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58598c24995..57026a6e9c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -39,6 +39,7 @@ #include "tree-log.h" #include "locking.h" #include "compat.h" +#include "volumes.h" /* * when auto defrag is enabled we @@ -584,7 +585,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - int drop_cache) + u64 *drop_end, int drop_cache) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -822,6 +823,8 @@ next_slot: btrfs_abort_transaction(trans, root, ret); } + if (drop_end) + *drop_end = min(end, extent_end); btrfs_release_path(path); return ret; } @@ -836,7 +839,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, inode, path, start, end, + ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, drop_cache); btrfs_free_path(path); return ret; @@ -1645,6 +1648,324 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_disk_bytenr(leaf, fi)) + return 0; + + if (key.offset == end) + return 1; + if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) + return 1; + return 0; +} + +static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, + struct btrfs_path *path, u64 offset, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct extent_map *hole_em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = offset; + + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + return ret; + BUG_ON(!ret); + + leaf = path->nodes[0]; + if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { + u64 num_bytes; + + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + + end - offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + + if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + u64 num_bytes; + + path->slots[0]++; + key.offset = offset; + btrfs_set_item_key_safe(trans, root, path, &key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - + offset; + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); + goto out; + } + btrfs_release_path(path); + + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, + 0, 0, end - offset, 0, end - offset, + 0, 0, 0); + if (ret) + return ret; + +out: + btrfs_release_path(path); + + hole_em = alloc_extent_map(); + if (!hole_em) { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + hole_em->start = offset; + hole_em->len = end - offset; + hole_em->orig_start = offset; + + hole_em->block_start = EXTENT_MAP_HOLE; + hole_em->block_len = 0; + hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->compress_type = BTRFS_COMPRESS_NONE; + hole_em->generation = trans->transid; + + do { + btrfs_drop_extent_cache(inode, offset, end - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, hole_em); + if (!ret) + list_move(&hole_em->list, + &em_tree->modified_extents); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + free_extent_map(hole_em); + if (ret) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } + + return 0; +} + +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_block_rsv *rsv; + struct btrfs_trans_handle *trans; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + u64 lockstart = (offset + mask) & ~mask; + u64 lockend = ((offset + len) & ~mask) - 1; + u64 cur_offset = lockstart; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 drop_end; + unsigned long nr; + int ret = 0; + int err = 0; + bool same_page = (offset >> PAGE_CACHE_SHIFT) == + ((offset + len) >> PAGE_CACHE_SHIFT); + + btrfs_wait_ordered_range(inode, offset, len); + + mutex_lock(&inode->i_mutex); + if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + /* + * Only do this if we are in the same page and we aren't doing the + * entire page. + */ + if (same_page && len < PAGE_CACHE_SIZE) { + ret = btrfs_truncate_page(inode, offset, len, 0); + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero back part of the first page */ + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + /* zero the front end of the last page */ + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + if (lockend < lockstart) { + mutex_unlock(&inode->i_mutex); + return 0; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len < lockstart || + ordered->file_offset > lockend)) && + !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_UPTODATE, 0, + cached_state)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_NOFS); + btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + ret = -ENOMEM; + goto out_free; + } + rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); + BUG_ON(ret); + trans->block_rsv = rsv; + + while (cur_offset < lockend) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, lockend + 1, + &drop_end, 1); + if (ret != -ENOSPC) + break; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + break; + } + + cur_offset = drop_end; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + err = ret; + break; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + break; + } + + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + rsv, min_size); + BUG_ON(ret); /* shouldn't happen */ + trans->block_rsv = rsv; + } + + if (ret) { + err = ret; + goto out_trans; + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out_free: + btrfs_free_path(path); + btrfs_free_block_rsv(root, rsv); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + mutex_unlock(&inode->i_mutex); + if (ret && !err) + err = ret; + return err; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -1663,10 +1984,13 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) + /* Make sure we aren't being give some crap mode */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + if (mode & FALLOC_FL_PUNCH_HOLE) + return btrfs_punch_hole(inode, offset, len); + /* * Make sure we have enough space before we do the * allocation. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0a4792492e..8cb272c8408 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3475,12 +3475,20 @@ error: } /* - * taken from block_truncate_page, but does cow as it zeros out - * any bytes left in the last page in the file. + * btrfs_truncate_page - read, zero a chunk and write a page + * @inode - inode that we're zeroing + * @from - the offset to start zeroing + * @len - the length to zero, 0 to zero the entire range respective to the + * offset + * @front - zero up to the offset instead of from the offset on + * + * This will find the page for the "from" offset and cow the page and zero the + * part we want to zero. This is used with truncate and hole punching. */ -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front) { - struct inode *inode = mapping->host; + struct address_space *mapping = inode->i_mapping; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -3495,7 +3503,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) u64 page_start; u64 page_end; - if ((offset & (blocksize - 1)) == 0) + if ((offset & (blocksize - 1)) == 0 && + (!len || ((len & (blocksize - 1)) == 0))) goto out; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -3555,8 +3564,13 @@ again: ret = 0; if (offset != PAGE_CACHE_SIZE) { + if (!len) + len = PAGE_CACHE_SIZE - offset; kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + if (front) + memset(kaddr, 0, offset); + else + memset(kaddr + offset, 0, len); flush_dcache_page(page); kunmap(page); } @@ -6796,7 +6810,7 @@ static int btrfs_truncate(struct inode *inode) u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); if (ret) return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0c39f58973d..f9b0fc91166 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2842,7 +2842,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->logged_trans == trans->transid) { ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, 0); + start + len, NULL, 0); if (ret) return ret; } -- cgit v1.2.3 From 6fc4e3548598d10a5e947797a09cbc1b257a22ab Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 30 Aug 2012 16:26:15 -0600 Subject: Btrfs: pass lockdep rwsem metadata to async commit transaction The freeze rwsem is taken by sb_start_intwrite() and dropped during the commit_ or end_transaction(). In the async case, that happens in a worker thread. Tell lockdep the calling thread is releasing ownership of the rwsem and the async thread is picking it up. XFS plays the same trick in fs/xfs/xfs_aops.c. Signed-off-by: Sage Weil --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c26004e05..730c94590c9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1229,6 +1229,14 @@ static void do_async_commit(struct work_struct *work) struct btrfs_async_commit *ac = container_of(work, struct btrfs_async_commit, work.work); + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + btrfs_commit_transaction(ac->newtrans, ac->root); kfree(ac); } @@ -1258,6 +1266,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, atomic_inc(&cur_trans->use_count); btrfs_end_transaction(trans, root); + + /* + * Tell lockdep we've released the freeze rwsem, since the + * async commit thread will be the one to unlock it. + */ + rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + schedule_delayed_work(&ac->work, 0); /* wait for transaction to start and unblock */ -- cgit v1.2.3 From e209db7ace281ca347b1ac699bf1fb222eac03fe Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 30 Aug 2012 16:26:16 -0600 Subject: Btrfs: set journal_info in async trans commit worker We expect current->journal_info to point to the trans handle we are committing. Signed-off-by: Sage Weil --- fs/btrfs/transaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 730c94590c9..ffc6b5202d5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1237,6 +1237,8 @@ static void do_async_commit(struct work_struct *work) &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 0, 1, _THIS_IP_); + current->journal_info = ac->newtrans; + btrfs_commit_transaction(ac->newtrans, ac->root); kfree(ac); } -- cgit v1.2.3 From ac14aed66558d686b1f95dac1f07ecfe11d8c30e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 30 Aug 2012 16:26:17 -0600 Subject: Btrfs: do not take cleanup_work_sem in btrfs_run_delayed_iputs() Josef has suggested that this is not necessary. Removing it also avoids this lockdep splat (after the new sb_internal locking stuff was added): [ 604.090449] ====================================================== [ 604.114819] [ INFO: possible circular locking dependency detected ] [ 604.139262] 3.6.0-rc2-ceph-00144-g463b030 #1 Not tainted [ 604.162193] ------------------------------------------------------- [ 604.186139] btrfs-cleaner/6669 is trying to acquire lock: [ 604.209555] (sb_internal#2){.+.+..}, at: [] start_transaction+0x124/0x430 [btrfs] [ 604.257100] [ 604.257100] but task is already holding lock: [ 604.300366] (&fs_info->cleanup_work_sem){.+.+..}, at: [] btrfs_run_delayed_iputs+0x72/0x130 [btrfs] [ 604.352989] [ 604.352989] which lock already depends on the new lock. [ 604.352989] [ 604.427104] [ 604.427104] the existing dependency chain (in reverse order) is: [ 604.478493] [ 604.478493] -> #1 (&fs_info->cleanup_work_sem){.+.+..}: [ 604.529313] [] lock_acquire+0xa2/0x140 [ 604.559621] [] down_read+0x39/0x4e [ 604.589382] [] btrfs_lookup_dentry+0x218/0x550 [btrfs] [ 604.596161] btrfs: unlinked 1 orphans [ 604.675002] [] create_subvol+0x62d/0x690 [btrfs] [ 604.708859] [] btrfs_mksubvol.isra.52+0x346/0x3a0 [btrfs] [ 604.772466] [] btrfs_ioctl_snap_create_transid+0x132/0x190 [btrfs] [ 604.842245] [] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [ 604.912852] [] btrfs_ioctl+0x138e/0x1990 [btrfs] [ 604.951888] [] do_vfs_ioctl+0x98/0x560 [ 604.989961] [] sys_ioctl+0x91/0xa0 [ 605.026628] [] system_call_fastpath+0x16/0x1b [ 605.064404] [ 605.064404] -> #0 (sb_internal#2){.+.+..}: [ 605.126832] [] __lock_acquire+0x1ac8/0x1b90 [ 605.163671] [] lock_acquire+0xa2/0x140 [ 605.200228] [] __sb_start_write+0xc6/0x1b0 [ 605.236818] [] start_transaction+0x124/0x430 [btrfs] [ 605.274029] [] btrfs_start_transaction+0x13/0x20 [btrfs] [ 605.340520] [] btrfs_evict_inode+0x19a/0x330 [btrfs] [ 605.378720] [] evict+0xb8/0x1c0 [ 605.416057] [] iput+0x105/0x210 [ 605.452373] [] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs] [ 605.521627] [] cleaner_kthread+0xa1/0x120 [btrfs] [ 605.560520] [] kthread+0xae/0xc0 [ 605.598094] [] kernel_thread_helper+0x4/0x10 [ 605.636499] [ 605.636499] other info that might help us debug this: [ 605.636499] [ 605.736504] Possible unsafe locking scenario: [ 605.736504] [ 605.801931] CPU0 CPU1 [ 605.835126] ---- ---- [ 605.867093] lock(&fs_info->cleanup_work_sem); [ 605.898594] lock(sb_internal#2); [ 605.931954] lock(&fs_info->cleanup_work_sem); [ 605.965359] lock(sb_internal#2); [ 605.994758] [ 605.994758] *** DEADLOCK *** [ 605.994758] [ 606.075281] 2 locks held by btrfs-cleaner/6669: [ 606.104528] #0: (&fs_info->cleaner_mutex){+.+...}, at: [] cleaner_kthread+0x95/0x120 [btrfs] [ 606.165626] #1: (&fs_info->cleanup_work_sem){.+.+..}, at: [] btrfs_run_delayed_iputs+0x72/0x130 [btrfs] [ 606.231297] [ 606.231297] stack backtrace: [ 606.287723] Pid: 6669, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00144-g463b030 #1 [ 606.347823] Call Trace: [ 606.376184] [] print_circular_bug+0x1fb/0x20c [ 606.409243] [] __lock_acquire+0x1ac8/0x1b90 [ 606.441343] [] ? start_transaction+0x124/0x430 [btrfs] [ 606.474583] [] lock_acquire+0xa2/0x140 [ 606.505934] [] ? start_transaction+0x124/0x430 [btrfs] [ 606.539429] [] ? do_raw_spin_unlock+0x5d/0xb0 [ 606.571719] [] __sb_start_write+0xc6/0x1b0 [ 606.603498] [] ? start_transaction+0x124/0x430 [btrfs] [ 606.637405] [] ? start_transaction+0x124/0x430 [btrfs] [ 606.670165] [] ? kmem_cache_alloc+0xb5/0x160 [ 606.702144] [] start_transaction+0x124/0x430 [btrfs] [ 606.735562] [] ? block_rsv_add_bytes+0x56/0x80 [btrfs] [ 606.769861] [] btrfs_start_transaction+0x13/0x20 [btrfs] [ 606.804575] [] btrfs_evict_inode+0x19a/0x330 [btrfs] [ 606.838756] [] ? _raw_spin_unlock+0x2b/0x40 [ 606.872010] [] evict+0xb8/0x1c0 [ 606.903800] [] iput+0x105/0x210 [ 606.935416] [] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs] [ 606.970510] [] ? cleaner_kthread+0x95/0x120 [btrfs] [ 607.005648] [] cleaner_kthread+0xa1/0x120 [btrfs] [ 607.040724] [] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs] [ 607.104740] [] kthread+0xae/0xc0 [ 607.137119] [] ? trace_hardirqs_on+0xd/0x10 [ 607.169797] [] kernel_thread_helper+0x4/0x10 [ 607.202472] [] ? retint_restore_args+0x13/0x13 [ 607.235884] [] ? flush_kthread_work+0x1a0/0x1a0 [ 607.268731] [] ? gs_change+0x13/0x13 Signed-off-by: Sage Weil --- fs/btrfs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cb272c8408..073af0724bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2118,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; - down_read(&root->fs_info->cleanup_work_sem); spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -2129,7 +2128,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } - up_read(&root->fs_info->cleanup_work_sem); } enum btrfs_orphan_cleanup_state { -- cgit v1.2.3 From 7014cdb49305eda0767d2ae6136f8c191ea8fd81 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 30 Aug 2012 20:06:49 -0400 Subject: Btrfs: btrfs_drop_extent_cache should never fail I noticed this when I was doing the fsync stuff, we allocate split extents if we drop an extent range that is in the middle of an existing extent. This BUG()'s if we fail to allocate memory, but the fact is this is just a cache, we will just regenerate the cache if we need it, the important part is that we free the range we are given. This can be done without allocations, so if we fail to allocate splits just skip the splitting stage and free our em and look for more extents to drop. This also makes btrfs_drop_extent_cache a void since nobody was checking the return value anyway. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/file.c | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 88adfe63840..b7cd3adb5a5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3316,8 +3316,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, u64 start, u64 end, int skip_pinned, int modified); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 57026a6e9c9..a50e98733e2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -459,8 +459,8 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, * this drops all the extents in the cache that intersect the range * [start, end]. Existing extents are split as required. */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) { struct extent_map *em; struct extent_map *split = NULL; @@ -479,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, testend = 0; } while (1) { + int no_splits = 0; + if (!split) split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); /* -ENOMEM */ + if (!split || !split2) + no_splits = 1; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -509,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); remove_extent_mapping(em_tree, em); + if (no_splits) + goto next; if (em->block_start < EXTENT_MAP_LAST_BYTE && em->start < start) { @@ -559,6 +564,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); split = NULL; } +next: write_unlock(&em_tree->lock); /* once for us */ @@ -570,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); if (split2) free_extent_map(split2); - return 0; } /* -- cgit v1.2.3 From cf93dccea67ad8f5e0d9163c6a0a584550bbd7cd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 2 Sep 2012 07:44:51 -0600 Subject: Btrfs: fix possible memory leak in scrub_setup_recheck_block() bbio has been malloced in btrfs_map_block() and should be freed before leaving from the error handling cases. spatch with a semantic match is used to found this problem. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun --- fs/btrfs/scrub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b223620cd5a..4e9eafe01c5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1029,6 +1029,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); sdev->stat.malloc_errors++; spin_unlock(&sdev->stat_lock); + kfree(bbio); return -ENOMEM; } sblock->page_count++; -- cgit v1.2.3 From 6fa9700e734275de2acbcb0e99414bd7ddfc60f1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:00:32 -0600 Subject: Btrfs: fix error path in create_pending_snapshot() This patch fixes the following problem: - If we failed to deal with the delayed dir items, we should abort transaction, just as its comment said. Fix it. - If root reference or root back reference insertion failed, we should abort transaction. Fix it. - Fix the double free problem of pending->inherit. - Do not restore the trans->rsv if we doesn't change it. - make the error path more clearly. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ffc6b5202d5..7be03185327 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -967,18 +967,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - rsv = trans->block_rsv; - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { ret = pending->error = -ENOMEM; - goto fail; + goto root_item_alloc_fail; } ret = btrfs_find_free_objectid(tree_root, &objectid); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); @@ -988,22 +986,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } } ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, objectid, pending->inherit); - kfree(pending->inherit); if (ret) { pending->error = ret; - goto fail; + goto no_free_objectid; } key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; @@ -1023,10 +1021,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_FT_DIR, index); if (ret == -EEXIST) { pending->error = -EEXIST; - dput(parent); goto fail; } else if (ret) { - goto abort_trans_dput; + goto abort_trans; } btrfs_i_size_write(parent_inode, parent_inode->i_size + @@ -1034,7 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, parent_root, parent_inode); if (ret) - goto abort_trans_dput; + goto abort_trans; /* * pull in the delayed directory update @@ -1043,10 +1040,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - if (ret) { /* Transaction aborted */ - dput(parent); - goto fail; - } + if (ret) /* Transaction aborted */ + goto abort_trans; record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); @@ -1079,7 +1074,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - goto abort_trans_dput; + goto abort_trans; } btrfs_set_lock_blocking(old); @@ -1089,7 +1084,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) - goto abort_trans_dput; + goto abort_trans; /* see comments in should_cow_block() */ root->force_cow = 1; @@ -1102,7 +1097,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) - goto abort_trans_dput; + goto abort_trans; /* * insert root back/forward references @@ -1111,9 +1106,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); - dput(parent); if (ret) - goto fail; + goto abort_trans; key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); @@ -1125,15 +1119,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) goto abort_trans; - ret = 0; fail: - kfree(new_root_item); + dput(parent); trans->block_rsv = rsv; +no_free_objectid: + kfree(new_root_item); +root_item_alloc_fail: btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; -abort_trans_dput: - dput(parent); abort_trans: btrfs_abort_transaction(trans, root, ret); goto fail; -- cgit v1.2.3 From 361048f586f59d414421c6486dd846063a0cac98 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:00:57 -0600 Subject: Btrfs: fix full backref problem when inserting shared block reference If we create several snapshots at the same time, the following BUG_ON() will be triggered. kernel BUG at fs/btrfs/extent-tree.c:6047! Steps to reproduce: # mkfs.btrfs # mount # cd # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done # for ((i=0; i<4; i++)) > do > mkdir $i > for ((j=0; j<200; j++)) > do > btrfs sub snap . $i/$j > done & > done The reason is: Before transaction commit, some operations changed the fs tree and new tree blocks were allocated because of COW. We used the implicit non-shared back reference for those newly allocated tree blocks because they were not shared by two or more trees. And then we created the first snapshot for the fs tree, according to the back reference rules, we also used implicit back refs for the child tree blocks of the root node of the fs tree, now those child nodes/leaves were shared by two trees. Then We didn't deal with the delayed references, and continued to change the fs tree(created the second snapshot and inserted the dir item of the new snapshot into the fs tree). According to the rules of the back reference, we added full back refs for those tree blocks whose parents have be shared by two trees. Now some newly allocated tree blocks had two types of the references. As we know, the delayed reference system handles these delayed references from back to front, and the full delayed reference is inserted after the implicit ones. So when we dealt with the back references of those newly allocated tree blocks, the full references was dealt with at first. And if the first reference is a shared back reference and the tree block that the reference points to is newly allocated, It would be considered as a tree block which is shared by two or more trees when it is allocated and should be a full back reference not a implicit one, the flag of its reference also should be set to FULL_BACKREF. But in fact, it was a non-shared tree block with a implicit reference at beginning, so it was not compulsory to set the flags to FULL_BACKREF. So BUG_ON was triggered. We have several methods to fix this bug: 1. deal with delayed references after the snapshot is created and before we change the source tree of the snapshot. This is the easiest and safest way. 2. modify the sort method of the delayed reference tree, make the full delayed references be inserted before the implicit ones. It is also very easy, but I don't know if it will introduce some problems or not. 3. modify select_delayed_ref() and make it select the implicit delayed reference at first. This way is not so good because it may wastes CPU time if we have lots of delayed references. 4. set the flags to FULL_BACKREF, this method is a little complex comparing with the 1st way. I chose the 1st way to fix it. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7be03185327..4b6ce5cab44 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1119,6 +1119,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) goto abort_trans; + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + goto abort_trans; fail: dput(parent); trans->block_rsv = rsv; -- cgit v1.2.3 From b9a8cc5bef963b76c5b6c3016b7e91988a3e758b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:01:21 -0600 Subject: Btrfs: fix file extent discount problem in the, snapshot If a snapshot is created while we are writing some data into the file, the i_size of the corresponding file in the snapshot will be wrong, it will be beyond the end of the last file extent. And btrfsck will report: root 256 inode 257 errors 100 Steps to reproduce: # mkfs.btrfs # mount # cd # dd if=/dev/zero of=tmpfile bs=4M count=1024 & # for ((i=0; i<4; i++)) > do > btrfs sub snap . $i > done This because the algorithm of disk_i_size update is wrong. Though there are some ordered extents behind the current one which we use to update disk_i_size, it doesn't mean those extents will be dealt with in the same transaction. So We shouldn't use the offset of those extents to update disk_i_size. Or we will get the wrong i_size in the snapshot. We fix this problem by recording the max real i_size. If we find there is a ordered extent which is in front of the current one and doesn't complete, we will record the end of the current one into that ordered extent. Surely, if the current extent holds the end of other extent(it must be greater than the current one because it is behind the current one), we will record the number that the current extent holds. In this way, we can exclude the ordered extents that may not be dealth with in the same transaction, and be easy to know the real disk_i_size. Signed-off-by: Miao Xie --- fs/btrfs/ordered-data.c | 62 ++++++++++++++----------------------------------- fs/btrfs/ordered-data.h | 7 ++++++ 2 files changed, 25 insertions(+), 44 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 051c7fe551d..cd8ecb73c05 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -775,7 +775,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; u64 disk_i_size; u64 new_i_size; - u64 i_size_test; u64 i_size = i_size_read(inode); struct rb_node *node; struct rb_node *prev = NULL; @@ -835,55 +834,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) + if (test->file_offset >= disk_i_size) { + /* + * we don't update disk_i_size now, so record this + * undealt i_size. Or we will not know the real + * i_size. + */ + if (test->outstanding_isize < offset) + test->outstanding_isize = offset; + if (ordered && + ordered->outstanding_isize > + test->outstanding_isize) + test->outstanding_isize = + ordered->outstanding_isize; goto out; - } - new_i_size = min_t(u64, offset, i_size); - - /* - * at this point, we know we can safely update i_size to at least - * the offset from this ordered extent. But, we need to - * walk forward and see if ios from higher up in the file have - * finished. - */ - if (ordered) { - node = rb_next(&ordered->rb_node); - } else { - if (prev) - node = rb_next(prev); - else - node = rb_first(&tree->tree); - } - - /* - * We are looking for an area between our current extent and the next - * ordered extent to update the i_size to. There are 3 cases here - * - * 1) We don't actually have anything and we can update to i_size. - * 2) We have stuff but they already did their i_size update so again we - * can just update to i_size. - * 3) We have an outstanding ordered extent so the most we can update - * our disk_i_size to is the start of the next offset. - */ - i_size_test = i_size; - for (; node; node = rb_next(node)) { - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - - if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) - continue; - if (test->file_offset > offset) { - i_size_test = test->file_offset; - break; } } + new_i_size = min_t(u64, offset, i_size); /* - * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents, we can safely set - * disk_i_size to this. + * Some ordered extents may completed before the current one, and + * we hold the real i_size in ->outstanding_isize. */ - if (i_size_test > offset) - new_i_size = min_t(u64, i_size_test, i_size); + if (ordered && ordered->outstanding_isize > new_i_size) + new_i_size = min_t(u64, ordered->outstanding_isize, i_size); BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e03c560d299..c2443a431ca 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -96,6 +96,13 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* + * the end of the ordered extent which is behind it but + * didn't update disk_i_size. Please see the comment of + * btrfs_ordered_update_i_size(); + */ + u64 outstanding_isize; + /* flags (described above) */ unsigned long flags; -- cgit v1.2.3 From 6352b91da1a2108bb8cc5115e8714f90d706f15f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:01:51 -0600 Subject: Btrfs: use a slab for ordered extents allocation The ordered extent allocation is in the fast path of the IO, so use a slab to improve the speed of the allocation. "Size of the struct is 280, so this will fall into the size-512 bucket, giving 8 objects per page, while own slab will pack 14 objects into a page. Another benefit I see is to check for leaked objects when the module is removed (and the cache destroy takes place)." -- David Sterba Signed-off-by: Miao Xie --- fs/btrfs/ordered-data.c | 23 +++++++++++++++++++++-- fs/btrfs/ordered-data.h | 2 ++ fs/btrfs/super.c | 9 ++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index cd8ecb73c05..e2b3d994ec0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -25,6 +25,8 @@ #include "btrfs_inode.h" #include "extent_io.h" +static struct kmem_cache *btrfs_ordered_extent_cache; + static u64 entry_end(struct btrfs_ordered_extent *entry) { if (entry->file_offset + entry->len < entry->file_offset) @@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, struct btrfs_ordered_extent *entry; tree = &BTRFS_I(inode)->ordered_tree; - entry = kzalloc(sizeof(*entry), GFP_NOFS); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) return -ENOMEM; @@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) list_del(&sum->list); kfree(sum); } - kfree(entry); + kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -958,3 +960,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, } spin_unlock(&root->fs_info->ordered_extent_lock); } + +int __init ordered_data_init(void) +{ + btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", + sizeof(struct btrfs_ordered_extent), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_extent_cache) + return -ENOMEM; + return 0; +} + +void ordered_data_exit(void) +{ + if (btrfs_ordered_extent_cache) + kmem_cache_destroy(btrfs_ordered_extent_cache); +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c2443a431ca..d1ddaeff135 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -192,4 +192,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only, int delay_iput); +int __init ordered_data_init(void); +void ordered_data_exit(void); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 83d6f9f9c22..06ff1dd0f9b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1620,10 +1620,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_extent_io; - err = btrfs_delayed_inode_init(); + err = ordered_data_init(); if (err) goto free_extent_map; + err = btrfs_delayed_inode_init(); + if (err) + goto free_ordered_data; + err = btrfs_interface_init(); if (err) goto free_delayed_inode; @@ -1641,6 +1645,8 @@ unregister_ioctl: btrfs_interface_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); +free_ordered_data: + ordered_data_exit(); free_extent_map: extent_map_exit(); free_extent_io: @@ -1657,6 +1663,7 @@ static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); btrfs_delayed_inode_exit(); + ordered_data_exit(); extent_map_exit(); extent_io_exit(); btrfs_interface_exit(); -- cgit v1.2.3 From 66d8f3dd1c87813d7f1cf8b774cb03e9b8d7e87e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:02:28 -0600 Subject: Btrfs: add a new "type" field into the block reservation structure Sometimes we need choose the method of the reservation according to the type of the block reservation, such as the reservation for the delayed inode update. Now we identify the type just by comparing the address of the reservation variants, it is very ugly if it is a temporary one because we need compare it with all the common reservation variants. So we add a new "type" field to keep the type the reservation variants. Signed-off-by: Miao Xie --- fs/btrfs/ctree.h | 18 ++++++++++++++---- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 15 +++++++++------ fs/btrfs/extent-tree.c | 8 +++++--- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/relocation.c | 3 ++- 8 files changed, 39 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7cd3adb5a5..2990a7ea624 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1028,13 +1028,22 @@ struct btrfs_space_info { wait_queue_head_t wait; }; +#define BTRFS_BLOCK_RSV_GLOBAL 1 +#define BTRFS_BLOCK_RSV_DELALLOC 2 +#define BTRFS_BLOCK_RSV_TRANS 3 +#define BTRFS_BLOCK_RSV_CHUNK 4 +#define BTRFS_BLOCK_RSV_DELOPS 5 +#define BTRFS_BLOCK_RSV_EMPTY 6 +#define BTRFS_BLOCK_RSV_TEMP 7 + struct btrfs_block_rsv { u64 size; u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full; - unsigned int failfast; + unsigned short full; + unsigned short type; + unsigned short failfast; }; /* @@ -2875,8 +2884,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 07d5eeb1e6f..eb768c4c135 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata( * we're accounted for. */ if (!src_rsv || (!trans->bytes_reserved && - src_rsv != &root->fs_info->delalloc_block_rsv)) { + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); /* * Since we're under a transaction reserve_metadata_bytes could @@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { spin_lock(&BTRFS_I(inode)->lock); if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 22e98e04c2e..0dcfb998229 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2014,12 +2014,15 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); - btrfs_init_block_rsv(&fs_info->global_block_rsv); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); - btrfs_init_block_rsv(&fs_info->trans_block_rsv); - btrfs_init_block_rsv(&fs_info->chunk_block_rsv); - btrfs_init_block_rsv(&fs_info->empty_block_rsv); - btrfs_init_block_rsv(&fs_info->delayed_block_rsv); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ee51b7afe97..36e03312267 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4108,13 +4108,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); + rsv->type = type; } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4123,7 +4125,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv); + btrfs_init_block_rsv(block_rsv, type); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); return block_rsv; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a50e98733e2..a9d7815cf58 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1874,7 +1874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { ret = -ENOMEM; goto out_free; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 073af0724bc..d34eb329720 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2195,7 +2195,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) int ret; if (!root->orphan_block_rsv) { - block_rsv = btrfs_alloc_block_rsv(root); + block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!block_rsv) return -ENOMEM; } @@ -3070,7 +3070,7 @@ out: static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (trans->block_rsv == &root->fs_info->global_block_rsv) { + if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3821,7 +3821,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); goto no_delete; @@ -6851,7 +6851,7 @@ static int btrfs_truncate(struct inode *inode) * 3) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(root); + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5543fd562b5..e6934de55a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -516,7 +516,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!pending_snapshot) return -ENOMEM; - btrfs_init_block_rsv(&pending_snapshot->block_rsv); + btrfs_init_block_rsv(&pending_snapshot->block_rsv, + BTRFS_BLOCK_RSV_TEMP); pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4da08652004..5a15c96a18f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc) struct btrfs_trans_handle *trans; int ret; - rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, + BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; -- cgit v1.2.3 From 42874b3db7817f662b1d7c6e32f8b63638fa0321 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:03:32 -0600 Subject: Btrfs: fix the snapshot that should not exist The snapshot should be the image of the fs tree before it was created, so the metadata of the snapshot should not exist in the its tree. But now, we found the directory item and directory name index is in both the snapshot tree and the fs tree. It introduces some problems and makes the users feel strange: # mkfs.btrfs /dev/sda1 # mount /dev/sda1 /mnt # mkdir /mnt/1 # cd /mnt/1 # btrfs subvolume snapshot /mnt snap0 # ls -a /mnt/1/snap0/1 . .. [no other file/dir] # ll /mnt/1/snap0/ total 0 drwxr-xr-x 1 root root 10 Ju1 24 12:11 1 ^^^ There is no file/dir in it, but it's size is 10 # cd /mnt/1/snap0/1/snap0 [Enter a unexisted directory successfully...] There is nothing in the directory 1 in snap0, but btrfs told the length of this directory is 10. Beside that, we can enter an unexisted directory, it is very strange to the users. # btrfs subvolume snapshot /mnt/1/snap0 /mnt/snap1 # ll /mnt/1/snap0/1/ total 0 [None] # ll /mnt/snap1/1/ total 0 drwxr-xr-x 1 root root 0 Ju1 24 12:14 snap0 And the source of snap1 did have any directory in Directory 1, but snap1 have a snap0, it is different between the source and the snapshot. So I think we should insert directory item and directory name index and update the parent inode as the last step of snapshot creation, and do not leave the useless metadata in the file tree. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 68 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4b6ce5cab44..a86fc723aad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -955,6 +955,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct inode *parent_inode; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; @@ -967,6 +969,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; + path = btrfs_alloc_path(); + if (!path) { + ret = pending->error = -ENOMEM; + goto path_alloc_fail; + } + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { ret = pending->error = -ENOMEM; @@ -1015,23 +1023,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_set_inode_index(parent_inode, &index); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - parent_inode, &key, - BTRFS_FT_DIR, index); - if (ret == -EEXIST) { + + /* check if there is a file/dir which has the same name. */ + dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, + btrfs_ino(parent_inode), + dentry->d_name.name, + dentry->d_name.len, 0); + if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto fail; - } else if (ret) { + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); goto abort_trans; } - - btrfs_i_size_write(parent_inode, parent_inode->i_size + - dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, parent_root, parent_inode); - if (ret) - goto abort_trans; + btrfs_release_path(path); /* * pull in the delayed directory update @@ -1123,12 +1128,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) goto abort_trans; + + ret = btrfs_insert_dir_item(trans, parent_root, + dentry->d_name.name, dentry->d_name.len, + parent_inode, &key, + BTRFS_FT_DIR, index); + /* We have check then name at the beginning, so it is impossible. */ + BUG_ON(ret == -EEXIST); + if (ret) + goto abort_trans; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, parent_root, parent_inode); + if (ret) + goto abort_trans; fail: dput(parent); trans->block_rsv = rsv; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: + btrfs_free_path(path); +path_alloc_fail: btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; @@ -1472,13 +1495,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_lock(&root->fs_info->reloc_mutex); - ret = btrfs_run_delayed_items(trans, root); + /* + * We needn't worry about the delayed items because we will + * deal with them in create_pending_snapshot(), which is the + * core function of the snapshot creation. + */ + ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } - ret = create_pending_snapshots(trans, root->fs_info); + /* + * We insert the dir indexes of the snapshots and update the inode + * of the snapshots' parents after the snapshot creation, so there + * are some delayed items which are not dealt with. Now deal with + * them. + * + * We needn't worry that this operation will corrupt the snapshots, + * because all the tree which are snapshoted will be forced to COW + * the nodes and leaves. + */ + ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; -- cgit v1.2.3 From 48c03c4bcfd7a1fcb1e05e9b1db1188cdbecf49a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:03:56 -0600 Subject: Btrfs: fix wrong size for the reservation of the, snapshot creation We should insert/update 6 items(root ref, root backref, dir item, dir index, root item and parent inode) when creating a snapshot, not 5 items, fix it. Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36e03312267..b6b33e463ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4414,10 +4414,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; /* - * two for root back/forward refs, two for directory entries - * and one for root of the snapshot. + * two for root back/forward refs, two for directory entries, + * one for root of the snapshot and one for parent inode. */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); dst_rsv->space_info = src_rsv->space_info; return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e6934de55a8..c56daa36876 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -526,7 +526,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, *inherit = NULL; /* take responsibility to free it */ } - trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + trans = btrfs_start_transaction(root->fs_info->extent_root, 6); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; -- cgit v1.2.3 From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:27 -0600 Subject: Btrfs: fix unprotected ->log_batch We forget to protect ->log_batch when syncing a file, this patch fix this problem by atomic operation. And ->log_batch is used to check if there are parallel sync operations or not, so it is unnecessary to reset it to 0 after the sync operation of the current log tree complete. Signed-off-by: Miao Xie --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/file.c | 4 ++-- fs/btrfs/tree-log.c | 12 +++++------- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2990a7ea624..6923b9e4f90 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1491,9 +1491,9 @@ struct btrfs_root { wait_queue_head_t log_commit_wait[2]; atomic_t log_writers; atomic_t log_commit[2]; + atomic_t log_batch; unsigned long log_transid; unsigned long last_log_commit; - unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0dcfb998229..34717ac2416 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1168,8 +1168,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a9d7815cf58..793bc89c660 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1551,9 +1551,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * ordered range does a filemape_write_and_wait_range which is why we * don't do it above like other file systems. */ - root->log_batch++; + atomic_inc(&root->log_batch); btrfs_wait_ordered_range(inode, start, end); - root->log_batch++; + atomic_inc(&root->log_batch); /* * check the transaction that last modified this inode diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f9b0fc91166..038a5229404 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -147,7 +147,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_multiple_pids = true; } - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; @@ -166,7 +166,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); - root->log_batch++; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return err; @@ -2036,7 +2036,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); while (1) { - unsigned long batch = root->log_batch; + int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); @@ -2044,7 +2044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); } wait_for_writer(trans, root); - if (batch == root->log_batch) + if (batch == atomic_read(&root->log_batch)) break; } @@ -2073,7 +2073,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); - root->log_batch = 0; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; @@ -2086,7 +2085,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); mutex_lock(&log_root_tree->log_mutex); - log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); mutex_unlock(&log_root_tree->log_mutex); @@ -2156,7 +2155,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); - log_root_tree->log_batch = 0; log_root_tree->log_transid++; smp_mb(); -- cgit v1.2.3 From 69ce977a179750915e04fcc12bfbe33e6c8f5132 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:44 -0600 Subject: Btrfs: output more information when aborting a unused transaction handle Though we dump the stack information when aborting a unused transaction handle, we don't know the correct place where we decide to abort the transaction handle if one function has several place where the transaction abort function is invoked and jumps to the same place after this call. And beside that we also don't know the reason why we jump to abort the current handle. So I modify the transaction abort function and make it output the function name, line and error information. Signed-off-by: Miao Xie --- fs/btrfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 06ff1dd0f9b..867e8e799de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -248,7 +248,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->blocks_used) { - btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); + char nbuf[16]; + const char *errstr; + + errstr = btrfs_decode_error(root->fs_info, errno, nbuf); + btrfs_printk(root->fs_info, + "%s:%d: Aborting unused transaction(%s).\n", + function, line, errstr); return; } trans->transaction->aborted = errno; -- cgit v1.2.3 From 903889f462409c816893abd02d88636f7b4a7774 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Sep 2012 04:04:57 -0600 Subject: Btrfs: fix wrong size for the reservation when doing, file pre-allocation. When we ran fsstress(a program in xfstests), the filesystem hung up when it is full. It was because the space reserved in btrfs_fallocate() was wrong, btrfs_fallocate() just used the size of the pre-allocation to reserve the space, didn't took the block size aligning into account, so the size of the reserved space was less than the allocated space, it caused the over reserve problem and made the filesystem hung up when invoking cow_file_range(). Fix it. Signed-off-by: Miao Xie --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 793bc89c660..00279c9a7f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2000,7 +2000,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, len); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); if (ret) return ret; @@ -2107,7 +2107,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); return ret; } -- cgit v1.2.3 From f54fb859da53f04a443c5e3f4cb9b936ed42d227 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 6 Sep 2012 03:08:59 -0600 Subject: Btrfs: fix error handling in delete_block_group_cache() btrfs_iget() never return NULL. So, NULL check is unnecessary. Signed-off-by: Tsutomu Itoh --- fs/btrfs/relocation.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5a15c96a18f..7e7fd1bcfc5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { - if (inode && !IS_ERR(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) { + if (!IS_ERR(inode)) iput(inode); return -ENOENT; } -- cgit v1.2.3 From 3d6b5c3b5c0b970ce8a9d3bac6854f5c0ce0295a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 6 Sep 2012 00:18:10 -0600 Subject: Btrfs: check return value of ulist_alloc() properly ulist_alloc() has the possibility of returning NULL. So, it is necessary to check the return value. Signed-off-by: Tsutomu Itoh --- fs/btrfs/qgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9b707ba5c6c..5039686df6a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1469,6 +1469,10 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * be exceeded */ ulist = ulist_alloc(GFP_ATOMIC); + if (!ulist) { + ret = -ENOMEM; + goto out; + } ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { @@ -1541,6 +1545,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) goto out; ulist = ulist_alloc(GFP_ATOMIC); + if (!ulist) { + btrfs_std_error(fs_info, -ENOMEM); + goto out; + } ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { -- cgit v1.2.3 From 9e8a4a8b0b9484e8d14674fc62c9ad8ac9dbce5b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 5 Sep 2012 19:10:51 -0600 Subject: Btrfs: use flag EXTENT_DEFRAG for snapshot-aware defrag We're going to use this flag EXTENT_DEFRAG to indicate which range belongs to defragment so that we can implement snapshow-aware defrag: We set the EXTENT_DEFRAG flag when dirtying the extents that need defragmented, so later on writeback thread can differentiate between normal writeback and writeback started by defragmentation. Original-Signed-off-by: Li Zefan Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 8 ++++++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 20 ++++++++++++-------- fs/btrfs/ioctl.c | 8 ++++---- 5 files changed, 28 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 19319f5a91a..4a41b17295d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1144,6 +1144,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, NULL, cached_state, mask); } +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 25900af5b15..512f8da041f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -235,6 +235,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits); struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 00279c9a7f3..0a4b03d8fcd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1203,8 +1203,8 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state, GFP_NOFS); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, &cached_state, GFP_NOFS); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d34eb329720..9b21cf97cdd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3549,7 +3549,8 @@ again: } clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, @@ -6061,7 +6062,8 @@ unlock: if (lockstart < lockend) { if (create && len < lockend - lockstart) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, unlock_bits, 1, 0, + lockstart + len - 1, + unlock_bits | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); /* * Beside unlock, we also need to cleanup reserved space @@ -6069,8 +6071,8 @@ unlock: */ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING, - 1, 0, NULL, GFP_NOFS); + unlock_bits | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); } else { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, @@ -6635,8 +6637,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, - &cached_state, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -6652,7 +6654,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -6749,7 +6752,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c56daa36876..de886ac7ba0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1023,8 +1023,8 @@ again: page_start, page_end - 1, 0, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + &cached_state, GFP_NOFS); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1035,8 +1035,8 @@ again: } - btrfs_set_extent_delalloc(inode, page_start, page_end - 1, - &cached_state); + set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, + &cached_state, GFP_NOFS); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, -- cgit v1.2.3 From dea31f52337c18f19eadfbbccb0c477942dad495 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Sep 2012 16:47:00 -0400 Subject: Btrfs: wait on async pages when shrinking delalloc Mitch reported a problem where you could get an ENOSPC error when untarring a kernel git tree onto a 16gb file system with compress-force=zlib. This is because compression is a huge pain, it will return from ->writepages() without having actually created any ordered extents. To get around this we check to see if the async submit counter is up, and if it is wait until it drops to 0 before doing our normal ordered wait dance. With this patch I can now untar a kernel git tree onto a 16gb file system without getting ENOSPC errors. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b6b33e463ac..06369d588f2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3697,6 +3697,13 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, WB_REASON_FS_FREE_SPACE); + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + wait_event(root->fs_info->async_submit_wait, + !atomic_read(&root->fs_info->async_delalloc_pages)); + spin_lock(&space_info->lock); if (space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + -- cgit v1.2.3 From a80c8dcf7e5065adc555ef8ffb256df11e3293e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Sep 2012 16:59:33 -0400 Subject: Btrfs: fix our overcommit math I noticed I was seeing large lags when running my torrent test in a vm on my laptop. While trying to make it lag less I noticed that our overcommit math was taking into account the number of bytes we wanted to reclaim, not the number of bytes we actually wanted to allocate, which means we wouldn't overcommit as often. This patch fixes the overcommit math and makes shrink_delalloc() use that logic so that it will stop looping faster. We still have pretty high spikes of latency, but the test now takes 3 minutes less time (about 5% faster). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 06369d588f2..a8de1c30f21 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3663,6 +3663,46 @@ out: return ret; } +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + int flush) +{ + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + /* * shrink metadata reservation for delalloc */ @@ -3705,10 +3745,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, !atomic_read(&root->fs_info->async_delalloc_pages)); spin_lock(&space_info->lock); - if (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use + orig <= - space_info->total_bytes) { + if (can_overcommit(root, space_info, orig, !trans)) { spin_unlock(&space_info->lock); break; } @@ -3924,7 +3961,6 @@ again: } if (ret) { - u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; /* @@ -3945,30 +3981,7 @@ again: goto again; } - spin_lock(&root->fs_info->free_chunk_lock); - avail = root->fs_info->free_chunk_space; - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. - */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; - - /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. - */ - if (flush) - avail >>= 3; - else - avail >>= 1; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (used + num_bytes < space_info->total_bytes + avail) { + if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, 1); -- cgit v1.2.3 From 837e197283199de640857192ca32767cb6e24fe8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Sep 2012 03:00:48 -0600 Subject: btrfs: polish names of kmem caches Usecase: watch 'grep btrfs < /proc/slabinfo' easy to watch all caches in one go. Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_map.c | 2 +- fs/btrfs/inode.c | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index eb768c4c135..b26d2f9d88d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("delayed_node", + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4a41b17295d..3ad84f50068 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,13 +64,13 @@ tree_fs_info(struct extent_io_tree *tree) int __init extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ac606f076eb..8d1364d385d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = kmem_cache_create("extent_map", + extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b21cf97cdd..aae55625aa5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7155,31 +7155,31 @@ void btrfs_destroy_cachep(void) int btrfs_init_cachep(void) { - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) -- cgit v1.2.3 From 8407aa464331556e4f6784f974030b83fc7585ed Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 7 Sep 2012 01:43:32 -0600 Subject: Btrfs: fix corrupted metadata in the snapshot When we delete a inode, we will remove all the delayed items including delayed inode update, and then truncate all the relative metadata. If there is lots of metadata, we will end the current transaction, and start a new transaction to truncate the left metadata. In this way, we will leave a inode item that its link counter is > 0, and also may leave some directory index items in fs/file tree after the current transaction ends. In other words, the metadata in this fs/file tree is inconsistent. If we create a snapshot for this tree now, we will find a inode with corrupted metadata in the new snapshot, and we won't continue to drop the left metadata, because its link counter is not 0. We fix this problem by updating the inode item before the current transaction ends. Signed-off-by: Miao Xie --- fs/btrfs/inode.c | 19 +++++++++---------- fs/btrfs/transaction.c | 29 +++++++++++++++++++++-------- fs/btrfs/transaction.h | 2 ++ 3 files changed, 32 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aae55625aa5..9e9754adf7a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3834,15 +3834,10 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(inode, 0); /* - * This is a bit simpler than btrfs_truncate since - * - * 1) We've already reserved our space for our orphan item in the - * unlink. - * 2) We're going to delete the inode item, so we don't need to update - * it at all. - * - * So we just need to reserve some slack space in case we add bytes when - * doing the truncate. + * This is a bit simpler than btrfs_truncate since we've already + * reserved our space for our orphan item in the unlink, so we just + * need to reserve some slack space in case we add bytes and update + * inode item when doing the truncate. */ while (1) { ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); @@ -3863,7 +3858,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_noflush(root, 1); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3876,6 +3871,10 @@ void btrfs_evict_inode(struct inode *inode) if (ret != -ENOSPC) break; + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a86fc723aad..f8ae448ebec 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -290,7 +290,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type) + u64 num_items, int type, + int noflush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -324,9 +325,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); + if (noflush) + ret = btrfs_block_rsv_add_noflush(root, + &root->fs_info->trans_block_rsv, + num_bytes); + else + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes); if (ret) return ERR_PTR(ret); } @@ -393,21 +399,28 @@ got_it: struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START); + return start_transaction(root, num_items, TRANS_START, 0); } + +struct btrfs_trans_handle *btrfs_start_transaction_noflush( + struct btrfs_root *root, int num_items) +{ + return start_transaction(root, num_items, TRANS_START, 1); +} + struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN); + return start_transaction(root, 0, TRANS_JOIN, 0); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_USERSPACE, 0); } /* wait for a transaction commit to be fully complete */ diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1a138bfb8f1..1b054800eca 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -97,6 +97,8 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_noflush( + struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); -- cgit v1.2.3 From 0647d6bd16c03c928a22d1b90c4c8684c5908f08 Mon Sep 17 00:00:00 2001 From: liubo Date: Fri, 7 Sep 2012 20:01:26 -0600 Subject: Btrfs: cleanup for unused ref cache stuff As ref cache has been removed from btrfs, there is no user on its lock and its check. Signed-off-by: Liu Bo Signed-off-by: Liu Bo --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.c | 5 ----- 2 files changed, 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6923b9e4f90..305002b7bf3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1376,9 +1376,6 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; - spinlock_t ref_cache_lock; - u64 total_ref_cache_size; - /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34717ac2416..8054f7ccf46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2000,7 +2000,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); @@ -3214,10 +3213,6 @@ int close_ctree(struct btrfs_root *root) printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); - } free_extent_buffer(fs_info->extent_root->node); free_extent_buffer(fs_info->extent_root->commit_root); -- cgit v1.2.3 From 69917e431210f8712fe050f47b7561e7dae89521 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 7 Sep 2012 20:01:28 -0600 Subject: Btrfs: fix a bug in parsing return value in logical resolve In logical resolve, we parse extent_from_logical()'s 'ret' as a kind of flag. It is possible to lose our errors because (-EXXXX & BTRFS_EXTENT_FLAG_TREE_BLOCK) is true. I'm not sure if it is on purpose, it just looks too hacky if it is. I'd rather use a real flag and a 'ret' to catch errors. Acked-by: Jan Schmidt Signed-off-by: Liu Bo --- fs/btrfs/backref.c | 24 ++++++++++++++++-------- fs/btrfs/backref.h | 3 ++- fs/btrfs/ioctl.c | 6 ++++-- fs/btrfs/scrub.c | 14 ++++++++------ fs/btrfs/send.c | 7 ++++--- 5 files changed, 34 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e600857d3ca..ab286e5bfe3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1193,7 +1193,8 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, * tree blocks and <0 on error. */ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key) + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) { int ret; u64 flags; @@ -1237,10 +1238,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, (unsigned long long)found_key->objectid, (unsigned long long)found_key->offset, (unsigned long long)flags, item_size); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - return BTRFS_EXTENT_FLAG_TREE_BLOCK; - if (flags & BTRFS_EXTENT_FLAG_DATA) - return BTRFS_EXTENT_FLAG_DATA; + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG_ON(1); + return 0; + } return -EIO; } @@ -1433,15 +1441,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, { int ret; u64 extent_item_pos; + u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; - ret = extent_from_logical(fs_info, logical, path, - &found_key); + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); btrfs_release_path(path); if (ret < 0) return ret; - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; extent_item_pos = logical - found_key.objectid; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 032f4dc7eab..4fda5d8029a 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,7 +40,8 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key); + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_extent_item *ei, u32 item_size, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de886ac7ba0..48ff3f7317b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3211,6 +3211,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, int ret = 0; int size; u64 extent_item_pos; + u64 flags = 0; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3240,10 +3241,11 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + ret = extent_from_logical(root->fs_info, loi->logical, path, &key, + &flags); btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4e9eafe01c5..d3bb901ade4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - u32 item_size; - int ret; + unsigned long ptr = 0; + u64 extent_item_pos; + u64 flags = 0; u64 ref_root; + u32 item_size; u8 ref_level; - unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_item_pos; + int ret; path = btrfs_alloc_path(); @@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path || !swarn.scratch_buf || !swarn.msg_buf) goto out; - ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, + &flags); if (ret < 0) goto out; @@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size = btrfs_item_size_nr(eb, path->slots[0]); btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e5c867996aa..c6ef070a8dc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1152,6 +1152,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 disk_byte; u64 num_bytes; u64 extent_item_pos; + u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx *backref_ctx = NULL; @@ -1198,13 +1199,13 @@ static int find_extent_clone(struct send_ctx *sctx, } logical = disk_byte + btrfs_file_extent_offset(eb, fi); - ret = extent_from_logical(sctx->send_root->fs_info, - disk_byte, tmp_path, &found_key); + ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, + &found_key, &flags); btrfs_release_path(tmp_path); if (ret < 0) goto out; - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = -EIO; goto out; } -- cgit v1.2.3 From df031f0752d50f2061df2847d57ea52a79f7977c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 7 Sep 2012 20:01:29 -0600 Subject: Btrfs: use helper for logical resolve We already have a helper, iterate_inodes_from_logical(), for logical resolve, so just use it. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 48ff3f7317b..fa1284d596e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3210,12 +3210,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_item_pos; - u64 flags = 0; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; - struct btrfs_key key; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3241,23 +3238,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - ret = extent_from_logical(root->fs_info, loi->logical, path, &key, - &flags); - btrfs_release_path(path); - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, + build_ino_list, inodes); + if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; - extent_item_pos = loi->logical - key.objectid; - ret = iterate_extent_inodes(root->fs_info, key.objectid, - extent_item_pos, 0, build_ino_list, - inodes); - - if (ret < 0) - goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, (void *)(unsigned long)inodes, size); if (ret) -- cgit v1.2.3 From 425d17a290c0c63785ec65db154a95c6337aeefa Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 7 Sep 2012 20:01:30 -0600 Subject: Btrfs: use larger limit for translation of logical to inode This is the change of the kernel side. Translation of logical to inode used to have an upper limit 4k on inode container's size, but the limit is not large enough for a data with a great many of refs, so when resolving logical address, we can end up with "ioctl ret=0, bytes_left=0, bytes_missing=19944, cnt=510, missed=2493" This changes to regard 64k as the upper limit and use vmalloc instead of kmalloc to get memory more easily. Signed-off-by: Josef Bacik Signed-off-by: Liu Bo --- fs/btrfs/backref.c | 5 +++-- fs/btrfs/ioctl.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ab286e5bfe3..7084431b7c9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -1584,7 +1585,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kmalloc(alloc_bytes, GFP_NOFS); + data = vmalloc(alloc_bytes); if (!data) return ERR_PTR(-ENOMEM); @@ -1635,6 +1636,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - kfree(ipath->fspath); + vfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa1284d596e..38e3d6ab89d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3230,7 +3230,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 4096); + size = min_t(u32, loi->size, 64 * 1024); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -3252,7 +3252,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, out: btrfs_free_path(path); - kfree(inodes); + vfree(inodes); kfree(loi); return ret; -- cgit v1.2.3 From 6df7881a84013f91405e5e113a4c322dd1804ba6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Sep 2012 08:08:30 -0600 Subject: Btrfs: move the sb_end_intwrite until after the throttle logic Sage reported the following lockdep backtrace ===================================== [ BUG: bad unlock balance detected! ] 3.6.0-rc2-ceph-00171-gc7ed62d #1 Not tainted ------------------------------------- btrfs-cleaner/7607 is trying to release lock (sb_internal) at: [] btrfs_commit_transaction+0xa6e/0xb20 [btrfs] but there are no more locks to release! other info that might help us debug this: 1 lock held by btrfs-cleaner/7607: #0: (&fs_info->cleaner_mutex){+.+...}, at: [] cleaner_kthread+0x95/0x120 [btrfs] stack backtrace: Pid: 7607, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00171-gc7ed62d #1 Call Trace: [] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs] [] print_unlock_inbalance_bug+0xfe/0x110 [] lock_release_non_nested+0x1ee/0x310 [] ? kmem_cache_free+0x7b/0x160 [] ? put_transaction+0x8c/0x130 [btrfs] [] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs] [] lock_release+0xd5/0x220 [] ? kmem_cache_free+0x151/0x160 [] __sb_end_write+0x7d/0x90 [] btrfs_commit_transaction+0xa6e/0xb20 [btrfs] [] ? __init_waitqueue_head+0x60/0x60 [] ? _raw_spin_unlock+0x2b/0x40 [] __btrfs_end_transaction+0x368/0x3c0 [btrfs] [] btrfs_end_transaction_throttle+0x18/0x20 [btrfs] [] btrfs_drop_snapshot+0x410/0x600 [btrfs] [] ? do_raw_spin_unlock+0x5d/0xb0 [] btrfs_clean_old_snapshots+0xaf/0x150 [btrfs] [] ? cleaner_kthread+0x95/0x120 [btrfs] [] cleaner_kthread+0xa9/0x120 [btrfs] [] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs] [] kthread+0xae/0xc0 [] ? trace_hardirqs_on+0xd/0x10 [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? flush_kthread_work+0x1a0/0x1a0 [] ? gs_change+0x13/0x13 This is because the throttle stuff can commit the transaction, which expects to be the one stopping the intwrite stuff, but we've already done it in the __btrfs_end_transaction. Moving the sb_end_intewrite after this logic makes the lockdep go away. Thanks, Tested-by: Sage Weil Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f8ae448ebec..0629edf9910 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -564,8 +564,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - sb_end_intwrite(root->fs_info->sb); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -586,6 +584,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } + sb_end_intwrite(root->fs_info->sb); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); -- cgit v1.2.3 From 69ffb54347a71c4f2a4694e0c1155af8538d55f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 11 Sep 2012 15:40:07 -0400 Subject: Btrfs: create a pinned em when writing to a prealloc range in DIO Wade Cline reported a problem where he was getting garbage and warnings when writing to a preallocated range via O_DIRECT. This is because we weren't creating our normal pinned extent_map for the range we were writing to, which was causing all sorts of issues. This patch fixes the problem and makes his testcase much happier. Thanks, Reported-by: Wade Cline Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9e9754adf7a..406666cb615 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5898,6 +5898,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, return ret; } +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + int type) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(); + if (!em) + return ERR_PTR(-ENOMEM); + + em->start = start; + em->orig_start = orig_start; + em->len = len; + em->block_len = block_len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + if (type == BTRFS_ORDERED_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + + do { + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + } while (ret == -EEXIST); + + if (ret) { + free_extent_map(em); + return ERR_PTR(ret); + } + + return em; +} + + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -6012,6 +6054,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { + u64 orig_start = em->start; + + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + em = create_pinned_em(inode, start, len, + orig_start, + block_start, len, type); + if (IS_ERR(em)) { + btrfs_end_transaction(trans, root); + goto unlock_err; + } + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); btrfs_end_transaction(trans, root); -- cgit v1.2.3 From 962197babeccc1f4cc8aa28ad844df80bdc85ed0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 11 Sep 2012 23:27:35 -0600 Subject: Btrfs: fix unnecessary warning when the fragments make the space alloc fail When we wrote some data by compress mode into a btrfs filesystem which was full of the fragments, the kernel will report: BTRFS warning (device xxx): Aborting unused transaction. The reason is: We can not find a long enough free space to store the compressed data because of the fragmentary free space, and the compressed data can not be splited, so the kernel outputed the above message. In fact, btrfs can deal with this problem very well: it fall back to uncompressed IO, split the uncompressed data into small ones, and then store them into to the fragmentary free space. So we shouldn't output the above warning message. Signed-off-by: Miao Xie --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 406666cb615..d41e467b4aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -662,7 +662,7 @@ retry: async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1); - if (ret) + if (ret && ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); } -- cgit v1.2.3 From be3940c0a90265654d778394cafe2e2cec674df8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Sep 2012 14:23:05 -0600 Subject: btrfs: Kill some bi_idx references For immutable bio vecs, I've been auditing and removing bi_idx references. These were harmless, but removing them will make auditing easier. scrub_bio_end_io_worker() was open coding a bio_reset() - but this doesn't appear to have been needed for anything as right after it does a bio_put(), and perusing the code it doesn't appear anything else was holding a reference to the bio. The other use end_bio_extent_readpage() was just for a pr_debug() - changed it to something that might be a bit more useful. Signed-off-by: Kent Overstreet CC: Chris Mason CC: Stefan Behrens --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/scrub.c | 15 --------------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3ad84f50068..90bd9f768c0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2306,8 +2306,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; - pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " - "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%ld\n", (u64)bio->bi_sector, err, (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d3bb901ade4..27892f67e69 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1669,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) scrub_block_put(sblock); } - if (sbio->err) { - /* what is this good for??? */ - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; - - for (i = 0; i < sbio->page_count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - } - bio_put(sbio->bio); sbio->bio = NULL; spin_lock(&sdev->list_lock); -- cgit v1.2.3 From ea658badc47e614e38ab4d98510488474c7e6d4b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 11 Sep 2012 16:57:25 -0400 Subject: Btrfs: delay block group item insertion So we have lots of places where we try to preallocate chunks in order to make sure we have enough space as we make our allocations. This has historically meant that we're constantly tweaking when we should allocate a new chunk, and historically we have gotten this horribly wrong so we way over allocate either metadata or data. To try and keep this from happening we are going to make it so that the block group item insertion is done out of band at the end of a transaction. This will allow us to create chunks even if we are trying to make an allocation for the extent tree. With this patch my enospc tests run faster (didn't expect this) and more efficiently use the disk space (this is what I wanted). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 5 ++ fs/btrfs/extent-tree.c | 130 ++++++++++++++++++++++++------------------------- fs/btrfs/transaction.c | 10 ++++ fs/btrfs/transaction.h | 1 + 4 files changed, 79 insertions(+), 67 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 305002b7bf3..d66dc1c6a40 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1137,6 +1137,9 @@ struct btrfs_block_group_cache { * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; + + /* For delayed block group creation */ + struct list_head new_bg_list; }; /* delayed seq elem */ @@ -2865,6 +2868,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8de1c30f21..254fd3289f4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2361,10 +2361,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } next: - do_chunk_alloc(trans, fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2478,10 +2474,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; @@ -2551,6 +2543,12 @@ again: } if (run_all) { + if (!list_empty(&trans->new_bgs)) { + spin_unlock(&delayed_refs->lock); + btrfs_create_pending_block_groups(trans, root); + spin_lock(&delayed_refs->lock); + } + node = rb_first(&delayed_refs->root); if (!node) goto out; @@ -3826,7 +3824,8 @@ enum flush_state { FLUSH_DELALLOC_WAIT = 2, FLUSH_DELAYED_ITEMS_NR = 3, FLUSH_DELAYED_ITEMS = 4, - COMMIT_TRANS = 5, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, }; static int flush_space(struct btrfs_root *root, @@ -3863,6 +3862,20 @@ static int flush_space(struct btrfs_root *root, ret = btrfs_run_delayed_items_nr(trans, root, nr); btrfs_end_transaction(trans, root); break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; case COMMIT_TRANS: ret = may_commit_transaction(root, space_info, orig_bytes, 0); break; @@ -5515,8 +5528,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int loop = 0; int index = 0; @@ -5548,9 +5559,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) @@ -5860,34 +5868,18 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, - root, ret); - goto out; - } - allowed_chunk_alloc = 0; - if (ret == 1) - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == - CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_FORCE); + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, + root, ret); + goto out; } - - /* - * We didn't allocate a chunk, go ahead and drop the - * empty size and loop again. - */ - if (!done_chunk_alloc) - loop = LOOP_NO_EMPTY_SIZE; } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -5962,20 +5954,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, data = btrfs_get_alloc_profile(root, data); again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, - CHUNK_ALLOC_NO_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - } - WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, hint_byte, ins, data); @@ -5985,12 +5963,6 @@ again: num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -7828,6 +7800,34 @@ error: return ret; } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, + new_bg_list) { + list_del_init(&block_group->new_bg_list); + + if (ret) + continue; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + } +} + int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7861,6 +7861,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); btrfs_init_free_space_ctl(cache); @@ -7892,12 +7893,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); /* Logic error */ - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - return ret; - } + list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0629edf9910..c01dec70c96 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -374,6 +374,7 @@ again: h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->new_bgs); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -549,6 +550,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -564,6 +568,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1400,6 +1407,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ cur_trans->delayed_refs.flushing = 1; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + ret = btrfs_run_delayed_refs(trans, root, 0); if (ret) goto cleanup_transaction; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1b054800eca..b6463e12804 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -68,6 +68,7 @@ struct btrfs_trans_handle { struct btrfs_root *root; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; + struct list_head new_bgs; }; struct btrfs_pending_snapshot { -- cgit v1.2.3 From 698d0082c4875a2ccc10b52ee8f415faad46b754 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Sep 2012 14:08:47 -0400 Subject: Btrfs: remove bytes argument from do_chunk_alloc Everybody is just making stuff up, and it's just used to see if we really do need to alloc a chunk, and since we do this when we already know we really do it's just a waste of space. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 254fd3289f4..0bc83b3f33d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); + struct btrfs_root *extent_root, u64 flags, + int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -3404,7 +3404,6 @@ alloc: return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, alloc_target, CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); @@ -3486,8 +3485,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes, - int force) + struct btrfs_space_info *sinfo, int force) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3518,7 +3516,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3568,8 +3566,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, } static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) + struct btrfs_root *extent_root, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; @@ -3593,7 +3590,7 @@ again: return 0; } - if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + if (!should_alloc_chunk(extent_root, space_info, force)) { spin_unlock(&space_info->lock); return 0; } else if (space_info->chunk_alloc) { @@ -3869,7 +3866,6 @@ static int flush_space(struct btrfs_root *root, break; } ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); @@ -5868,8 +5864,7 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, + ret = do_chunk_alloc(trans, root, data, CHUNK_ALLOC_FORCE); /* * Do not bail out on ENOSPC since we @@ -7269,7 +7264,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7279,7 +7274,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7293,7 +7288,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type) { u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + return do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); } -- cgit v1.2.3 From 90abccf2c6e6e9c5a5d519eaed95292afa30aa11 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 13 Sep 2012 04:53:47 -0600 Subject: Revert "Btrfs: do not do filemap_write_and_wait_range in fsync" This reverts commit 0885ef5b5601e9b007c383e77c172769b1f214fd After applying the above patch, the performance slowed down because the dirty page flush can only be done by one task, so revert it. The following is the test result of sysbench: Before After 24MB/s 39MB/s Signed-off-by: Miao Xie --- fs/btrfs/file.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0a4b03d8fcd..d0fc4c5aaf1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1544,12 +1544,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); + /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); /* - * we wait first, since the writeback may change the inode, also wait - * ordered range does a filemape_write_and_wait_range which is why we - * don't do it above like other file systems. + * We flush the dirty pages again to avoid some dirty pages in the + * range being left. */ atomic_inc(&root->log_batch); btrfs_wait_ordered_range(inode, start, end); -- cgit v1.2.3 From 99621b44aa194eab594e1f17217231c02b519211 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Aug 2012 16:31:33 -0400 Subject: btrfs: reada_extent doesn't need kref for refcount All increments and decrements are under the same spinlock - have to be, since they need to protect the radix_tree it's found in. Just use int, no need to wank with kref... Signed-off-by: Al Viro --- fs/btrfs/reada.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 48a4882d8ad..a955669519a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -68,7 +68,7 @@ struct reada_extent { u32 blocksize; int err; struct list_head extctl; - struct kref refcnt; + int refcnt; spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; @@ -126,7 +126,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); if (!re) @@ -336,7 +336,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); if (re) @@ -352,7 +352,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); - kref_init(&re->refcnt); + re->refcnt = 1; /* * map block @@ -398,7 +398,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); BUG_ON(!re_exist); - kref_get(&re_exist->refcnt); + re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); goto error; } @@ -465,10 +465,6 @@ error: return re_exist; } -static void reada_kref_dummy(struct kref *kr) -{ -} - static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { @@ -476,7 +472,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, unsigned long index = re->logical >> PAGE_CACHE_SHIFT; spin_lock(&fs_info->reada_lock); - if (!kref_put(&re->refcnt, reada_kref_dummy)) { + if (--re->refcnt) { spin_unlock(&fs_info->reada_lock); return; } @@ -671,7 +667,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; } dev->reada_next = re->logical + re->blocksize; - kref_get(&re->refcnt); + re->refcnt++; spin_unlock(&fs_info->reada_lock); -- cgit v1.2.3 From 8c0a85377048b64c880e76ec7368904fe46d0b94 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 26 Sep 2012 11:33:07 +1000 Subject: fs: push rcu_barrier() from deactivate_locked_super() to filesystems There's no reason to call rcu_barrier() on every deactivate_locked_super(). We only need to make sure that all delayed rcu free inodes are flushed before we destroy related cache. Removing rcu_barrier() from deactivate_locked_super() affects some fast paths. E.g. on my machine exit_group() of a last process in IPC namespace takes 0.07538s. rcu_barrier() takes 0.05188s of that time. Signed-off-by: Kirill A. Shutemov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/btrfs/extent_io.c | 6 ++++++ fs/btrfs/inode.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4c878476bb9..b08ea4717e9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -107,6 +107,12 @@ void extent_io_exit(void) list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec154f95464..cf03a91d806 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7076,6 +7076,11 @@ static void init_once(void *foo) void btrfs_destroy_cachep(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); if (btrfs_inode_cachep) kmem_cache_destroy(btrfs_inode_cachep); if (btrfs_trans_handle_cachep) -- cgit v1.2.3 From 0433f20d436fd040013e5a419a2e3d732d618a41 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 13 Sep 2012 03:32:32 -0600 Subject: Btrfs: cleanup of error processing in btree_get_extent() This patch simplifies a little complex error processing in btree_get_extent(). Signed-off-by: Tsutomu Itoh --- fs/btrfs/disk-io.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8054f7ccf46..8db87bc53d2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -222,21 +222,17 @@ static struct extent_map *btree_get_extent(struct inode *inode, free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; + if (!em) { + lookup_extent_mapping(em_tree, failed_start, + failed_len); + em = ERR_PTR(-EIO); } } else if (ret) { free_extent_map(em); - em = NULL; + em = ERR_PTR(ret); } write_unlock(&em_tree->lock); - if (ret) - em = ERR_PTR(ret); out: return em; } -- cgit v1.2.3 From b4f359ab065b9c6c132124557d23e90e2e8504be Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 13 Sep 2012 03:32:54 -0600 Subject: Btrfs: remove unnecessary code in btree_get_extent() Unnecessary lookup_extent_mapping() is removed because an error is returned to the caller. This patch was made based on the advice from Stefan Behrens, thanks. Signed-off-by: Tsutomu Itoh --- fs/btrfs/disk-io.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8db87bc53d2..8d633e3e391 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -217,16 +217,10 @@ static struct extent_map *btree_get_extent(struct inode *inode, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - lookup_extent_mapping(em_tree, failed_start, - failed_len); + if (!em) em = ERR_PTR(-EIO); - } } else if (ret) { free_extent_map(em); em = ERR_PTR(ret); -- cgit v1.2.3 From 1bcea35597693b3ac1ec1b311cfd42d52972a710 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 14 Sep 2012 00:04:21 -0600 Subject: Btrfs: write_buf is now callable outside send.c Developing service cmds needs it. Signed-off-by: Anand Jain --- fs/btrfs/send.c | 11 ++++++----- fs/btrfs/send.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c6ef070a8dc..c7beb543a4a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -386,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void) return path; } -static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; mm_segment_t old_fs; @@ -396,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len) set_fs(KERNEL_DS); while (pos < len) { - ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos, - &sctx->send_off); + ret = vfs_write(filp, (char *)buf + pos, len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; @@ -553,7 +552,8 @@ static int send_header(struct send_ctx *sctx) strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - return write_buf(sctx, &hdr, sizeof(hdr)); + return write_buf(sctx->send_filp, &hdr, sizeof(hdr), + &sctx->send_off); } /* @@ -590,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx) crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); - ret = write_buf(sctx, sctx->send_buf, sctx->send_size); + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); sctx->total_send_size += sctx->send_size; sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9934e948e57..1bf4f32fd4e 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,4 +130,5 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off); #endif -- cgit v1.2.3 From ebb3dad4353b94c12a5cffab4397727c19f088e5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 13 Sep 2012 20:29:02 -0600 Subject: Btrfs: using for_each_set_bit_from to simplify the code Using for_each_set_bit_from() to simplify the code. spatch with a semantic match is used to found this. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun --- fs/btrfs/free-space-cache.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6b10acfc2f5..b107e68797f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); - for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); if ((next_zero - i) >= bits) { @@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, again: found_bits = 0; - for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { -- cgit v1.2.3 From b3ae244e7174d981c09ad7a6a68e7909d600aaca Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 13 Sep 2012 16:04:34 -0600 Subject: btrfs: return EPERM upon rmdir on a subvolume A subvolume cannot be deleted via rmdir, but the error code ENOTEMPTY is confusing. Return EPERM instead, as this is not permitted. Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d41e467b4aa..1c50f7c4f5a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3200,9 +3200,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; unsigned long nr = 0; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) + return -EPERM; trans = __unlink_start_trans(dir, dentry); if (IS_ERR(trans)) -- cgit v1.2.3 From 60376ce4a8396bc5cd777be05b6a9bf044520f42 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 10:34:40 -0400 Subject: Btrfs: fix race in sync and freeze again I screwed this up, there is a race between checking if there is a running transaction and actually starting a transaction in sync where we could race with a freezer and get ourselves into trouble. To fix this we need to make a new join type to only do the try lock on the freeze stuff. If it fails we'll return EPERM and just return from sync. This fixes a hang Liu Bo reported when running xfstest 68 in a loop. Thanks, Reported-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 15 ++++++--------- fs/btrfs/transaction.c | 12 +++++++++++- fs/btrfs/transaction.h | 1 + 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 867e8e799de..903ab2d7068 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -854,16 +854,13 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); - spin_lock(&fs_info->trans_lock); - if (!fs_info->running_transaction) { - spin_unlock(&fs_info->trans_lock); - return 0; - } - spin_unlock(&fs_info->trans_lock); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + trans = btrfs_join_transaction_freeze(root); + if (IS_ERR(trans)) { + /* Frozen, don't bother */ + if (PTR_ERR(trans) == -EPERM) + return 0; return PTR_ERR(trans); + } return btrfs_commit_transaction(trans, root); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c01dec70c96..e4bfac8d54b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -272,6 +272,7 @@ enum btrfs_trans_type { TRANS_JOIN, TRANS_USERSPACE, TRANS_JOIN_NOLOCK, + TRANS_JOIN_FREEZE, }; static int may_wait_transaction(struct btrfs_root *root, int type) @@ -341,7 +342,11 @@ again: if (!h) return ERR_PTR(-ENOMEM); - sb_start_intwrite(root->fs_info->sb); + if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) { + if (type == TRANS_JOIN_FREEZE) + return ERR_PTR(-EPERM); + sb_start_intwrite(root->fs_info->sb); + } if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -424,6 +429,11 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0); +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b6463e12804..fbf8313b9d6 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -102,6 +102,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush( struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From ab26e9d6c8a760aa5c4518eb2cb1a85516b3a4f7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Sep 2012 02:58:05 -0600 Subject: Btrfs: cleanup for duplicated code in find_free_extent There is already an 'add free space' phrase in front of this one, we needn't to redo it. Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0bc83b3f33d..27a6b3e6fa4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5828,10 +5828,6 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); if (used_block_group != block_group) btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); -- cgit v1.2.3 From 2e90cf858fd2639eae2fb3f2173f7633cb15bf38 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Sep 2012 02:58:06 -0600 Subject: Btrfs: cleanup fs_info->hashers fs_info->hashers is now an obsolete one. Signed-off-by: Liu Bo --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - 2 files changed, 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d66dc1c6a40..1372057e1ec 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1253,7 +1253,6 @@ struct btrfs_fs_info { struct mutex reloc_mutex; struct list_head trans_list; - struct list_head hashers; struct list_head dead_roots; struct list_head caching_block_groups; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8d633e3e391..5d3f813e13a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1984,7 +1984,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); -- cgit v1.2.3 From 6bbe3a9c805fcb8cd8d396dafd32078181a7cdd5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 14 Sep 2012 02:58:07 -0600 Subject: Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents nocow_only is now an obsolete argument. Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/ordered-data.c | 12 +----------- fs/btrfs/ordered-data.h | 3 +-- fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 2 +- 6 files changed, 7 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 27a6b3e6fa4..a3a902fdeb4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3722,7 +3722,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); return; } @@ -3748,7 +3748,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); } else { time_left = schedule_timeout_killable(1); if (time_left) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e2b3d994ec0..7772f02ba28 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -468,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -484,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, cur = splice.next; ordered = list_entry(cur, struct btrfs_ordered_extent, root_extent_list); - if (nocow_only && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - list_move(&ordered->root_extent_list, - &root->fs_info->ordered_extents); - cond_resched_lock(&root->fs_info->ordered_extent_lock); - continue; - } - list_del_init(&ordered->root_extent_list); atomic_inc(&ordered->refs); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d1ddaeff135..dd27a0b46a3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -190,8 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); int __init ordered_data_init(void); void ordered_data_exit(void); #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7e7fd1bcfc5..6e530bb86c9 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4058,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->flags); btrfs_start_delalloc_inodes(fs_info->tree_root, 0); - btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 903ab2d7068..5aa3b8182d9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -852,7 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); trans = btrfs_join_transaction_freeze(root); if (IS_ERR(trans)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e4bfac8d54b..c9265a60348 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1475,7 +1475,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); - btrfs_wait_ordered_extents(root, 0, 1); + btrfs_wait_ordered_extents(root, 1); } ret = btrfs_run_delayed_items(trans, root); -- cgit v1.2.3 From 98114659e0d467e2c0ee6f24f2429329328fc312 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 11:22:38 -0400 Subject: Btrfs: fix race with freeze and free space inodes So we start our freeze, somebody comes in and does an fsync() on a file where we have to commit a transaction for whatever reason, and we will deadlock because the freeze is waiting on FS_FREEZE people to stop writing to the file system, but the transaction is waiting for its free space inodes to be written out, which are in turn waiting on sb_start_intwrite while trying to write the file extents. To fix this we'll just skip the sb_start_intwrite() if we TRANS_JOIN_NOLOCK since we're being waited on by a transaction commit so we're safe wrt to freeze and this will keep us from deadlocking. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c9265a60348..a4fe5494d01 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -342,7 +342,15 @@ again: if (!h) return ERR_PTR(-ENOMEM); - if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) { + /* + * If we are JOIN_NOLOCK we're already committing a transaction and + * waiting on this guy, so we don't need to do the sb_start_intwrite + * because we're already holding a ref. We need this because we could + * have raced in and did an fsync() on a file which can kick a commit + * and then we deadlock with somebody doing a freeze. + */ + if (type != TRANS_JOIN_NOLOCK && + !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) { if (type == TRANS_JOIN_FREEZE) return ERR_PTR(-EPERM); sb_start_intwrite(root->fs_info->sb); @@ -601,7 +609,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - sb_end_intwrite(root->fs_info->sb); + if (lock) + sb_end_intwrite(root->fs_info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); -- cgit v1.2.3 From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 12:59:20 -0400 Subject: Btrfs: do not hold the write_lock on the extent tree while logging Dave Sterba pointed out a sleeping while atomic bug while doing fsync. This is because I'm an idiot and didn't realize that rwlock's were spin locks, so we've been holding this thing while doing allocations and such which is not good. This patch fixes this by dropping the write lock before we do anything heavy and re-acquire it when it is done. We also need to take a ref on the em's in case their corresponding pages are evicted and mark them as being logged so that releasepage does not remove them and doesn't remove them from our local list. Thanks, Reported-by: Dave Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 3 ++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 21 +++++++++++++++++---- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8d1364d385d..b8cbc8d5c7f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -407,7 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); - list_del_init(&em->list); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); em->in_tree = 0; return ret; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e6294b5135..679225555f7 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,6 +13,7 @@ #define EXTENT_FLAG_COMPRESSED 1 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 038a5229404..ed1f7ce7219 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2945,6 +2945,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_del_init(&em->list); if (em->generation <= test_gen) continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); } @@ -2954,13 +2957,18 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ - if (ret) + if (ret) { + free_extent_map(em); continue; + } + + write_unlock(&tree->lock); /* * If the previous EM and the last extent we left off on aren't @@ -2971,21 +2979,26 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); - if (ret) + if (ret) { + free_extent_map(em); + write_lock(&tree->lock); continue; + } btrfs_release_path(path); args.nr = 0; } ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + free_extent_map(em); + write_lock(&tree->lock); } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); if (!ret && args.nr) ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); - WARN_ON(!list_empty(&extents)); - write_unlock(&tree->lock); return ret; } -- cgit v1.2.3 From b5bae2612af92fd8e7bcdcf7ce3e0259e8d341c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 13:43:01 -0400 Subject: Btrfs: fix race when getting the eb out of page->private We can race when checking wether PagePrivate is set on a page and we actually have an eb saved in the pages private pointer. We could have easily written out this page and released it in the time that we did the pagevec lookup and actually got around to looking at this page. So use mapping->private_lock to ensure we get a consistent view of the page->private pointer. This is inline with the alloc and releasepage paths which use private_lock when manipulating page->private. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90bd9f768c0..a2c21570adf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3256,19 +3256,34 @@ retry: break; } + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ if (!eb) { + spin_unlock(&mapping->private_lock); WARN_ON(1); continue; } - if (eb == prev_eb) + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); continue; + } - if (!atomic_inc_not_zero(&eb->refs)) { - WARN_ON(1); + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) continue; - } prev_eb = eb; ret = lock_extent_buffer_for_io(eb, fs_info, &epd); -- cgit v1.2.3 From 892951a92ebf8b390815b24f9418f297a5081898 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 13:51:53 -0400 Subject: Btrfs: remove unused write cache pages hook The btree inode has it's own write cache pages so we can remove this write cache pages hook as it's not used. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5d3f813e13a..2c18a4eb4d5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3347,52 +3347,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)) -{ - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *eb; - - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. - */ - if (!PagePrivate(page)) - goto out; - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) - goto out; - - if (!btrfs_try_tree_write_lock(eb)) { - flush_fn(data); - btrfs_tree_lock(eb); - } - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - - btrfs_tree_unlock(eb); -out: - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - return 0; -} - static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { @@ -3787,7 +3741,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) } static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, -- cgit v1.2.3 From 926ced123bd6651b30a07f65a2a8a0b26154cd58 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 13:58:59 -0400 Subject: Btrfs: don't do anything in our ->freeze_fs and ->unfreeze_fs We do not need to do anything special to freeze or unfreeze, it's all taken care of by the generic work, and what we currently have is wrong anyway since we shouldn't be returnning to userspace with mutexes held anyway. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5aa3b8182d9..fb260bbf59c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1511,17 +1511,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_lock(&fs_info->transaction_kthread_mutex); - mutex_lock(&fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_unlock(&fs_info->cleaner_mutex); - mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } -- cgit v1.2.3 From c3308f84c1743eabb91f4976a314d118d5ea2342 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 14:51:22 -0400 Subject: Btrfs: fix punch hole when no extent exists I saw the warning in btrfs_drop_extent_cache where our end is less than our start while running xfstests 68 in a loop. This is because we unconditionally do drop_end = min(end, extent_end) in __btrfs_drop_extents(), even though we may not have found an extent in the range we were looking to drop. So keep track of wether or not we found something, and if we didn't just use our end. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d0fc4c5aaf1..110d3cb7b6f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -609,6 +609,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int ret; int modify_tree = -1; int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int found = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -674,6 +675,7 @@ next_slot: goto next_slot; } + found = 1; search_start = max(key.offset, start); if (recow || !modify_tree) { modify_tree = -1; @@ -829,7 +831,7 @@ next_slot: } if (drop_end) - *drop_end = min(end, extent_end); + *drop_end = found ? min(end, extent_end) : end; btrfs_release_path(path); return ret; } -- cgit v1.2.3 From 7e97b8daf63487c20f78487bd4045f39b0d97cf4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Sep 2012 05:56:55 -0600 Subject: btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 38e3d6ab89d..4d7f4bbf4c9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) int ret; u64 ip_oldflags; unsigned int i_oldflags; + umode_t mode; if (btrfs_root_readonly(root)) return -EROFS; @@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip_oldflags = ip->flags; i_oldflags = inode->i_flags; + mode = inode->i_mode; flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); @@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_DIRSYNC; else ip->flags &= ~BTRFS_INODE_DIRSYNC; - if (flags & FS_NOCOW_FL) - ip->flags |= BTRFS_INODE_NODATACOW; - else - ip->flags &= ~BTRFS_INODE_NODATACOW; + if (flags & FS_NOCOW_FL) { + if (S_ISREG(mode)) { + /* + * It's safe to turn csums off here, no extents exist. + * Otherwise we want the flag to reflect the real COW + * status of the file and will not set it. + */ + if (inode->i_size == 0) + ip->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; + } else { + ip->flags |= BTRFS_INODE_NODATACOW; + } + } else { + /* + * Revert back under same assuptions as above + */ + if (S_ISREG(mode)) { + if (inode->i_size == 0) + ip->flags &= ~(BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM); + } else { + ip->flags &= ~BTRFS_INODE_NODATACOW; + } + } /* * The COMPRESS flag can only be changed by users, while the NOCOMPRESS -- cgit v1.2.3 From aa42ffd918c420d5625b25b7a0bc2bbde4c9f890 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 18 Sep 2012 03:52:23 -0600 Subject: Btrfs: fix off-by-one in file clone Btrfs uses inclusive range end for lock_extent(), unlock_extent() and related functions, so we made off-by-one errors in file clone. This fixes it and also fixes some style problems. Signed-off-by: Liu Bo --- fs/btrfs/ioctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4d7f4bbf4c9..d6836af6d60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2481,13 +2481,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len); - ordered = btrfs_lookup_first_ordered_extent(src, off+len); + lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1); if (!ordered && - !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, - EXTENT_DELALLOC, 0, NULL)) + !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1, + EXTENT_DELALLOC, 0, NULL)) break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); btrfs_wait_ordered_range(src, off, len); @@ -2561,7 +2561,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(path); if (key.offset + datal <= off || - key.offset >= off+len) + key.offset >= off + len - 1) goto next; memcpy(&new_key, &key, sizeof(new_key)); @@ -2662,8 +2662,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset += skip; } - if (key.offset + datal > off+len) - trim = key.offset + datal - (off+len); + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); if (comp && (skip || trim)) { ret = -EINVAL; @@ -2740,7 +2740,7 @@ next: ret = 0; out: btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 8732d44f806a9da9a7ca4d1704b8a1ed81639bc4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 17 Sep 2012 23:52:38 -0600 Subject: Btrfs: fix the missing error information in create_pending_snapshot() The macro btrfs_abort_transaction() can get the line number of the code where the problem happens, so we should invoke it in the place that the error occurs, or we will lose the line number. Reported-by: David Sterba Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 57 +++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a4fe5494d01..910ff8051ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1073,7 +1073,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - goto abort_trans; + btrfs_abort_transaction(trans, root, ret); + goto fail; } btrfs_release_path(path); @@ -1084,8 +1085,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - if (ret) /* Transaction aborted */ - goto abort_trans; + if (ret) { /* Transaction aborted */ + btrfs_abort_transaction(trans, root, ret); + goto fail; + } record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); @@ -1118,7 +1121,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - goto abort_trans; + btrfs_abort_transaction(trans, root, ret); + goto fail; } btrfs_set_lock_blocking(old); @@ -1127,8 +1131,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } /* see comments in should_cow_block() */ root->force_cow = 1; @@ -1140,8 +1146,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } /* * insert root back/forward references @@ -1150,23 +1158,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - goto abort_trans; + btrfs_abort_transaction(trans, root, ret); + goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, @@ -1174,15 +1189,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST); - if (ret) - goto abort_trans; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, parent_root, parent_inode); if (ret) - goto abort_trans; + btrfs_abort_transaction(trans, root, ret); fail: dput(parent); trans->block_rsv = rsv; @@ -1193,10 +1210,6 @@ root_item_alloc_fail: path_alloc_fail: btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; - -abort_trans: - btrfs_abort_transaction(trans, root, ret); - goto fail; } /* -- cgit v1.2.3 From 005d6427ac4f276d937a36ca6a1d62b181ed70bf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Sep 2012 07:52:32 -0600 Subject: btrfs: move transaction aborts to the point of failure Call btrfs_abort_transaction as early as possible when an error condition is detected, that way the line number reported is useful and we're not clueless anymore which error path led to the abort. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/extent-tree.c | 56 +++++++++++++++++++++++++++++++------------------- fs/btrfs/root-tree.c | 29 +++++++++++++++----------- fs/btrfs/volumes.c | 37 ++++++++++++++++++++------------- 4 files changed, 80 insertions(+), 47 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1372057e1ec..399521ab61d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3400,6 +3400,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ + #define btrfs_abort_transaction(trans, root, errno) \ do { \ __btrfs_abort_transaction(trans, root, __func__, \ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a3a902fdeb4..395e222e39a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5123,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5142,8 +5144,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } extent_slot = path->slots[0]; } } else if (ret == -ENOENT) { @@ -5157,7 +5161,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); } else { - goto abort; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } leaf = path->nodes[0]; @@ -5167,8 +5172,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5185,8 +5192,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -5223,8 +5233,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } } else { if (found_extent) { @@ -5241,27 +5253,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } out: btrfs_free_path(path); return ret; - -abort: - btrfs_abort_transaction(trans, extent_root, ret); - goto out; } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 10d8e4d8807..eb923d087da 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_release_path(path); ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + ret = btrfs_del_item(trans, root, path); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - if (ret < 0) - goto out_abort; + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr_offset(l, slot); @@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root out: btrfs_free_path(path); return ret; - -out_abort: - btrfs_abort_transaction(trans, root, ret); - goto out; } int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 88b969aeeb7..4f4de51b5bd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1775,15 +1775,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { ret = init_first_rw_device(trans, root, device); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } ret = btrfs_finish_sprout(trans, root); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } } else { ret = btrfs_add_device(trans, root, device); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto error_trans; + } } /* @@ -1814,7 +1820,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: unlock_chunks(root); - btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); rcu_string_free(device->name); kfree(device); @@ -3608,12 +3613,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } /* * Modifying chunk tree needs allocating new blocks from both @@ -3623,19 +3632,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, */ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } ret = __finish_chunk_alloc(trans, extent_root, sys_map, sys_chunk_offset, sys_chunk_size, sys_stripe_size); if (ret) - goto abort; + btrfs_abort_transaction(trans, root, ret); - return 0; +out: -abort: - btrfs_abort_transaction(trans, root, ret); return ret; } -- cgit v1.2.3 From 0aa4a17d8232e7d8a42763b53bb9943bc0484b18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Sep 2012 15:42:38 -0400 Subject: Btrfs: handle not finding the extent exactly when logging changed extents I started hitting warnings when running xfstest 68 in a loop because there were EM's that were not lined up properly with the physical extents. This is ok, if we do something like punch a hole or write to a preallocated space or something like that we can have an EM that doesn't cover the entire physical extent. So fix the tree logging stuff to cope with this case so we don't just commit the transaction. With this patch I no longer see the warnings from the tree logging code. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ed1f7ce7219..e0ef92f91d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2833,6 +2833,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; + u64 search_start = start; u64 len = em->mod_len; u64 num_bytes; int nritems; @@ -2848,23 +2849,55 @@ static int log_one_extent(struct btrfs_trans_handle *trans, while (len) { if (args->nr) goto next_slot; +again: key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; + key.offset = search_start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; + if (ret) { /* - * This shouldn't happen, but it might so warn and - * return an error. + * A rare case were we can have an em for a section of a + * larger extent so we need to make sure that this em + * falls within the extent we've found. If not we just + * bail and go back to ye-olde way of doing things but + * it happens often enough in testing that we need to do + * this dance to make sure. */ - WARN_ON(1); - return -ENOENT; + do { + if (path->slots[0] == 0) { + btrfs_release_path(path); + if (search_start == 0) + return -ENOENT; + search_start--; + goto again; + } + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY) { + btrfs_release_path(path); + return -ENOENT; + } + } while (key.offset > start); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], + fi); + if (key.offset + num_bytes <= start) { + btrfs_release_path(path); + return -ENOENT; + } } args->src = path->nodes[0]; next_slot: + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); fi = btrfs_item_ptr(args->src, path->slots[0], struct btrfs_file_extent_item); if (args->nr && @@ -2898,8 +2931,9 @@ next_slot: } else { len -= num_bytes; } - start += btrfs_file_extent_num_bytes(args->src, fi); + start = key.offset + num_bytes; args->next_offset = start; + search_start = start; if (path->slots[0] < nritems) { if (len) -- cgit v1.2.3 From 5a1d7843ca4b3a9009bea87f85ad33854b910aea Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 17 Aug 2012 14:04:41 -0700 Subject: btrfs: improved readablity for add_inode_ref Moved part of the code into a sub function and replaced most of the gotos by ifs, hoping that it will be easier to read now. Signed-off-by: Jan Schmidt Signed-off-by: Mark Fasheh --- fs/btrfs/tree-log.c | 178 ++++++++++++++++++++++++++++------------------------ 1 file changed, 97 insertions(+), 81 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e0ef92f91d6..15dae589e59 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -785,76 +785,18 @@ out: return match; } - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). - */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, +static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_root *log, struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) + struct btrfs_root *log_root, + struct inode *dir, struct inode *inode, + struct btrfs_key *key, + struct extent_buffer *eb, + struct btrfs_inode_ref *ref, + char *name, int namelen, int *search_done) { - struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; - struct inode *dir; - struct inode *inode; - unsigned long ref_ptr; - unsigned long ref_end; - char *name; - int namelen; int ret; - int search_done = 0; - - /* - * it is possible that we didn't log all the parent directories - * for a given inode. If we don't find the dir, just don't - * copy the back ref in. The link count fixup code will take - * care of the rest - */ - dir = read_one_inode(root, key->offset); - if (!dir) - return -ENOENT; - - inode = read_one_inode(root, key->objectid); - if (!inode) { - iput(dir); - return -EIO; - } - - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); - -again: - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - - /* if we already have a perfect match, we're done */ - if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { - goto out; - } - - /* - * look for a conflicting back reference in the metadata. - * if we find one we have to unlink that name of the file - * before we add our new link. Later on, we overwrite any - * existing back reference, and we don't want to create - * dangling pointers in the directory. - */ - - if (search_done) - goto insert; + struct btrfs_dir_item *di; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret == 0) { @@ -869,7 +811,7 @@ again: * if so, just jump out, we're done */ if (key->objectid == key->offset) - goto out_nowrite; + return 1; /* check all the names in this back reference to see * if they are in the log. if so, we allow them to stay @@ -888,7 +830,7 @@ again: (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log, key, victim_name, + if (!backref_in_log(log_root, key, victim_name, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(path); @@ -907,7 +849,7 @@ again: * NOTE: we have searched root tree and checked the * coresponding ref, it does not need to check again. */ - search_done = 1; + *search_done = 1; } btrfs_release_path(path); @@ -930,25 +872,99 @@ again: } btrfs_release_path(path); -insert: - /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, - btrfs_inode_ref_index(eb, ref)); - BUG_ON(ret); + return 0; +} + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct btrfs_inode_ref *ref; + struct inode *dir; + struct inode *inode; + unsigned long ref_ptr; + unsigned long ref_end; + char *name; + int namelen; + int ret; + int search_done = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; - btrfs_update_inode(trans, root, inode); + inode = read_one_inode(root, key->objectid); + if (!inode) { + iput(dir); + return -EIO; + } -out: - ref_ptr = (unsigned long)(ref + 1) + namelen; - kfree(name); - if (ref_ptr < ref_end) - goto again; + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + while (ref_ptr < ref_end) { + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + /* + * look for a conflicting back reference in the + * metadata. if we find one we have to unlink that name + * of the file before we add our new link. Later on, we + * overwrite any existing back reference, and we don't + * want to create dangling pointers in the directory. + */ + + if (!search_done) { + ret = __add_inode_ref(trans, root, path, log, + dir, inode, key, eb, ref, + name, namelen, + &search_done); + if (ret == 1) + goto out; + BUG_ON(ret); + } + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, + 0, btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + } + + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + } /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); BUG_ON(ret); -out_nowrite: +out: btrfs_release_path(path); iput(dir); iput(inode); -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4..f6b40e86121 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1599,6 +1599,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.2.3 From f186373fef005cee948a4a39e6a14c2e5f517298 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 8 Aug 2012 11:32:27 -0700 Subject: btrfs: extended inode refs This patch adds basic support for extended inode refs. This includes support for link and unlink of the refs, which basically gets us support for rename as well. Inode creation does not need changing - extended refs are only added after the ref array is full. Signed-off-by: Mark Fasheh --- fs/btrfs/backref.c | 68 ++++++++++ fs/btrfs/backref.h | 5 + fs/btrfs/ctree.h | 53 +++++++- fs/btrfs/hash.h | 10 ++ fs/btrfs/inode-item.c | 285 +++++++++++++++++++++++++++++++++++++++-- fs/btrfs/inode.c | 23 ++-- fs/btrfs/tree-log.c | 345 ++++++++++++++++++++++++++++++++++++++++++-------- 7 files changed, 710 insertions(+), 79 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7084431b7c9..dc963121d1d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1109,6 +1109,74 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, found_key); } +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. + */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + /* * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements * of the path are separated by '/' and the path is guaranteed to be diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 4fda5d8029a..0b920c11395 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -70,4 +70,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); void free_ipath(struct inode_fs_paths *ipath); +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); + #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 399521ab61d..50dcd0fbae1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -154,6 +154,13 @@ struct btrfs_ordered_sum; */ #define BTRFS_NAME_LEN 255 +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* 32 bytes in various csum fields */ #define BTRFS_CSUM_SIZE 32 @@ -489,6 +496,8 @@ struct btrfs_super_block { */ #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) + #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -496,7 +505,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* * A leaf is full of items. offset and size tell us where to find @@ -643,6 +653,14 @@ struct btrfs_inode_ref { /* name goes here */ } __attribute__ ((__packed__)); +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + struct btrfs_timespec { __le64 sec; __le32 nsec; @@ -1601,6 +1619,7 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_INODE_ITEM_KEY 1 #define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 #define BTRFS_XATTR_ITEM_KEY 24 #define BTRFS_ORPHAN_ITEM_KEY 48 /* reserve 2-15 close to the inode for later flexibility */ @@ -1987,6 +2006,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags, BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); @@ -3184,12 +3210,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod, + u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3197,6 +3223,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, + u64 ref_objectid, const char *name, + int name_len, + struct btrfs_inode_extref **extref_ret); + /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index db2ff9773b9..1d982812ab6 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); } + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) crc32c(parent_objectid, name, len); +} + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a13cf1a96c7..48b8fda9313 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "disk-io.h" +#include "hash.h" #include "transaction.h" #include "print-tree.h" @@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } -struct btrfs_inode_ref * +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, + const char *name, int name_len, + struct btrfs_inode_extref **extref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_extref *extref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + /* + * Search all extended backrefs in this item. We're only + * looking through any collisions so most of the time this is + * just going to compare against one buffer. If all is well, + * we'll return success and the inode ref object. + */ + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_ptr = (unsigned long)(&extref->name); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (ref_name_len == name_len && + btrfs_inode_extref_parent(leaf, extref) == ref_objectid && + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { + if (extref_ret) + *extref_ret = extref; + return 1; + } + + cur_offset += ref_name_len + sizeof(*extref); + } + return 0; +} + +static struct btrfs_inode_ref * btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod) + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) { + int ret; struct btrfs_key key; struct btrfs_inode_ref *ref; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - int ret; key.objectid = inode_objectid; key.type = BTRFS_INODE_REF_KEY; @@ -77,10 +117,147 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, return ref; } -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, +/* Returns NULL if no extref found */ +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow) +{ + int ret; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + return NULL; + return extref; +} + +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod, + u64 *ret_index) +{ + struct btrfs_inode_ref *ref; + struct btrfs_inode_extref *extref; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len, + inode_objectid, ref_objectid, ins_len, + cow); + if (IS_ERR(ref)) + return PTR_ERR(ref); + + if (ref != NULL) { + *ret_index = btrfs_inode_ref_index(path->nodes[0], ref); + return 0; + } + + btrfs_release_path(path); + + extref = btrfs_lookup_inode_extref(trans, root, path, name, + name_len, inode_objectid, + ref_objectid, ins_len, cow); + if (IS_ERR(extref)) + return PTR_ERR(extref); + + if (extref) { + *ret_index = btrfs_inode_extref_index(path->nodes[0], extref); + return 0; + } + + return -ENOENT; +} + +int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + int ret; + int del_len = name_len + sizeof(*extref); + unsigned long ptr; + unsigned long item_start; + u32 item_size; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; + + /* + * Sanity check - did we find the right item for this name? + * This should always succeed so error here will make the FS + * readonly. + */ + if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, &extref)) { + btrfs_std_error(root->fs_info, -ENOENT); + ret = -EROFS; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (index) + *index = btrfs_inode_extref_index(leaf, extref); + + if (del_len == item_size) { + /* + * Common case only one ref in the item, remove the + * whole item. + */ + ret = btrfs_del_item(trans, root, path); + goto out; + } + + ptr = (unsigned long)extref; + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + + memmove_extent_buffer(leaf, ptr, ptr + del_len, + item_size - (ptr + del_len - item_start)); + + btrfs_truncate_item(trans, root, path, item_size - del_len, 1); + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; struct btrfs_key key; @@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 item_size; u32 sub_item_len; int ret; + int search_ext_refs = 0; int del_len = name_len + sizeof(*ref); key.objectid = inode_objectid; @@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; + search_ext_refs = 1; goto out; } else if (ret < 0) { goto out; } if (!find_name_in_backref(path, name, name_len, &ref)) { ret = -ENOENT; + search_ext_refs = 1; goto out; } leaf = path->nodes[0]; @@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(trans, root, path, - item_size - sub_item_len, 1); + btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); +out: + btrfs_free_path(path); + + if (search_ext_refs) { + /* + * No refs were found, or we could not find the + * name in our ref array. Find and remove the extended + * inode ref then. + */ + return btrfs_del_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, index); + } + + return ret; +} + +/* + * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * + * The caller must have checked against BTRFS_LINK_MAX already. + */ +static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_inode_extref *extref; + int ret; + int ins_len = name_len + sizeof(*extref); + unsigned long ptr; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_item *item; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, name_len, NULL)) + goto out; + + btrfs_extend_item(trans, root, path, ins_len); + ret = 0; + } + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); + ptr += btrfs_item_size(leaf, item) - ins_len; + extref = (struct btrfs_inode_extref *)ptr; + + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_index(path->nodes[0], extref, index); + btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); + + ptr = (unsigned long)&extref->name; + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + out: btrfs_free_path(path); return ret; @@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); + + if (ret == -EMLINK) { + struct btrfs_super_block *disk_super = root->fs_info->super_copy; + /* We ran out of space in the ref array. Need to + * add an extended ref. */ + if (btrfs_super_incompat_flags(disk_super) + & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + ret = btrfs_insert_inode_extref(trans, root, name, + name_len, + inode_objectid, + ref_objectid, index); + } + return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1c50f7c4f5a..596305e4d75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2903,7 +2903,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; - struct btrfs_inode_ref *ref; struct btrfs_dir_item *di; struct inode *inode = dentry->d_inode; u64 index; @@ -3017,17 +3016,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, } btrfs_release_path(path); - ref = btrfs_lookup_inode_ref(trans, root, path, - dentry->d_name.name, dentry->d_name.len, - ino, dir_ino, 0); - if (IS_ERR(ref)) { - err = PTR_ERR(ref); + ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, + dentry->d_name.len, ino, dir_ino, 0, + &index); + if (ret) { + err = ret; goto out; } - BUG_ON(!ref); /* Logic error */ + if (check_path_shared(root, path)) goto out; - index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(path); /* @@ -4743,6 +4742,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ key[1].objectid = objectid; btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); key[1].offset = ref_objectid; @@ -5049,7 +5054,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EXDEV; - if (inode->i_nlink == ~0U) + if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; err = btrfs_set_inode_index(dir, &index); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 15dae589e59..1d7b3484432 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -24,8 +24,10 @@ #include "disk-io.h" #include "locking.h" #include "print-tree.h" +#include "backref.h" #include "compat.h" #include "tree-log.h" +#include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -743,6 +745,7 @@ out: */ static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, + u64 ref_objectid, char *name, int namelen) { struct btrfs_path *path; @@ -763,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log, if (ret != 0) goto out; - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (btrfs_find_name_in_ext_backref(path, ref_objectid, + name, namelen, NULL)) + match = 1; + + goto out; + } + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); ptr_end = ptr + item_size; while (ptr < ptr_end) { ref = (struct btrfs_inode_ref *)ptr; @@ -790,27 +802,36 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_root *log_root, struct inode *dir, struct inode *inode, - struct btrfs_key *key, struct extent_buffer *eb, - struct btrfs_inode_ref *ref, - char *name, int namelen, int *search_done) + u64 inode_objectid, u64 parent_objectid, + u64 ref_index, char *name, int namelen, + int *search_done) { int ret; + char *victim_name; + int victim_name_len; + struct extent_buffer *leaf; struct btrfs_dir_item *di; + struct btrfs_key search_key; + struct btrfs_inode_extref *extref; - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); +again: + /* Search old style refs */ + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = parent_objectid; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret == 0) { - char *victim_name; - int victim_name_len; struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; - struct extent_buffer *leaf = path->nodes[0]; + + leaf = path->nodes[0]; /* are we trying to overwrite a back ref for the root directory * if so, just jump out, we're done */ - if (key->objectid == key->offset) + if (search_key.objectid == search_key.offset) return 1; /* check all the names in this back reference to see @@ -830,7 +851,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log_root, key, victim_name, + if (!backref_in_log(log_root, &search_key, + parent_objectid, + victim_name, victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(path); @@ -838,9 +861,14 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); + BUG_ON(ret); btrfs_run_delayed_items(trans, root); + kfree(victim_name); + *search_done = 1; + goto again; } kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } BUG_ON(ret); @@ -853,10 +881,74 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + /* Same search but for extended refs */ + extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + inode_objectid, parent_objectid, 0, + 0); + if (!IS_ERR_OR_NULL(extref)) { + u32 item_size; + u32 cur_offset = 0; + unsigned long base; + struct inode *victim_parent; + + leaf = path->nodes[0]; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + base = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)base + cur_offset; + + victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + victim_name = kmalloc(victim_name_len, GFP_NOFS); + read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, + victim_name_len); + + search_key.objectid = inode_objectid; + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(parent_objectid, + victim_name, + victim_name_len); + ret = 0; + if (!backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len)) { + ret = -ENOENT; + victim_parent = read_one_inode(root, + parent_objectid); + if (victim_parent) { + btrfs_inc_nlink(inode); + btrfs_release_path(path); + + ret = btrfs_unlink_inode(trans, root, + victim_parent, + inode, + victim_name, + victim_name_len); + btrfs_run_delayed_items(trans, root); + } + BUG_ON(ret); + iput(victim_parent); + kfree(victim_name); + *search_done = 1; + goto again; + } + kfree(victim_name); + BUG_ON(ret); +next: + cur_offset += victim_name_len + sizeof(*extref); + } + *search_done = 1; + } + btrfs_release_path(path); + /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); + ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); BUG_ON(ret); @@ -875,6 +967,48 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, return 0; } +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) +{ + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)ref_ptr; + + *namelen = btrfs_inode_extref_name_len(eb, extref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)&extref->name, + *namelen); + + *index = btrfs_inode_extref_index(eb, extref); + if (parent_objectid) + *parent_objectid = btrfs_inode_extref_parent(eb, extref); + + return 0; +} + +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) +{ + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ref_ptr; + + *namelen = btrfs_inode_ref_name_len(eb, ref); + *name = kmalloc(*namelen, GFP_NOFS); + if (*name == NULL) + return -ENOMEM; + + read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + + *index = btrfs_inode_ref_index(eb, ref); + + return 0; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -888,7 +1022,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_inode_ref *ref; struct inode *dir; struct inode *inode; unsigned long ref_ptr; @@ -897,6 +1030,27 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int namelen; int ret; int search_done = 0; + int log_ref_ver = 0; + u64 parent_objectid; + u64 inode_objectid; + u64 ref_index; + int ref_struct_size; + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *r; + + ref_struct_size = sizeof(struct btrfs_inode_extref); + log_ref_ver = 1; + r = (struct btrfs_inode_extref *)ref_ptr; + parent_objectid = btrfs_inode_extref_parent(eb, r); + } else { + ref_struct_size = sizeof(struct btrfs_inode_ref); + parent_objectid = key->offset; + } + inode_objectid = key->objectid; /* * it is possible that we didn't log all the parent directories @@ -904,32 +1058,38 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, key->offset); + dir = read_one_inode(root, parent_objectid); if (!dir) return -ENOENT; - inode = read_one_inode(root, key->objectid); + inode = read_one_inode(root, inode_objectid); if (!inode) { iput(dir); return -EIO; } - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); - while (ref_ptr < ref_end) { - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); + /* + * parent object can change from one array + * item to another. + */ + if (!dir) + dir = read_one_inode(root, parent_objectid); + if (!dir) + return -ENOENT; + } else { + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); + } + if (ret) + return ret; /* if we already have a perfect match, we're done */ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { + ref_index, name, namelen)) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -940,8 +1100,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (!search_done) { ret = __add_inode_ref(trans, root, path, log, - dir, inode, key, eb, ref, - name, namelen, + dir, inode, eb, + inode_objectid, + parent_objectid, + ref_index, name, namelen, &search_done); if (ret == 1) goto out; @@ -950,14 +1112,18 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, - 0, btrfs_inode_ref_index(eb, ref)); + 0, ref_index); BUG_ON(ret); btrfs_update_inode(trans, root, inode); } - ref_ptr = (unsigned long)(ref + 1) + namelen; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); + if (log_ref_ver) { + iput(dir); + dir = NULL; + } } /* finally write the back reference in the inode */ @@ -981,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans, return ret; } +static int count_inode_extrefs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) +{ + int ret = 0; + int name_len; + unsigned int nlink = 0; + u32 item_size; + u32 cur_offset = 0; + u64 inode_objectid = btrfs_ino(inode); + u64 offset = 0; + unsigned long ptr; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay. So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found. If it goes down to zero, the iput - * will free the inode. - */ -static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) + while (1) { + ret = btrfs_find_one_extref(root, inode_objectid, offset, path, + &extref, &offset); + if (ret) + break; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *) (ptr + cur_offset); + name_len = btrfs_inode_extref_name_len(leaf, extref); + + nlink++; + + cur_offset += name_len + sizeof(*extref); + } + + offset++; + btrfs_release_path(path); + } + btrfs_release_path(path); + + if (ret < 0) + return ret; + return nlink; +} + +static int count_inode_refs(struct btrfs_root *root, + struct inode *inode, struct btrfs_path *path) { - struct btrfs_path *path; int ret; struct btrfs_key key; - u64 nlink = 0; + unsigned int nlink = 0; unsigned long ptr; unsigned long ptr_end; int name_len; @@ -1009,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1046,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + + return nlink; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + u64 nlink = 0; + u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = count_inode_refs(root, inode, path); + if (ret < 0) + goto out; + + nlink = ret; + + ret = count_inode_extrefs(root, inode, path); + if (ret == -ENOENT) + ret = 0; + + if (ret < 0) + goto out; + + nlink += ret; + + ret = 0; + if (nlink != inode->i_nlink) { set_nlink(inode, nlink); btrfs_update_inode(trans, root, inode); @@ -1061,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = insert_orphan_item(trans, root, ino); BUG_ON(ret); } - btrfs_free_path(path); - return 0; +out: + btrfs_free_path(path); + return ret; } static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, @@ -1710,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); -- cgit v1.2.3 From d24bec3ae528a47149b838aad76c006d40fe8a39 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 8 Aug 2012 11:33:54 -0700 Subject: btrfs: extended inode ref iteration The iterate_irefs in backref.c is used to build path components from inode refs. This patch adds code to iterate extended refs as well. I had modify the callback function signature to abstract out some of the differences between ref structures. iref_to_path() also needed similar changes. Signed-off-by: Mark Fasheh --- fs/btrfs/backref.c | 173 ++++++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/backref.h | 2 - 2 files changed, 138 insertions(+), 37 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dc963121d1d..f3187938e08 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1177,26 +1177,12 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, return ret; } -/* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, +static char *ref_to_path(struct btrfs_root *fs_root, + struct btrfs_path *path, + u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size) { - u32 len; int slot; u64 next_inum; int ret; @@ -1204,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct extent_buffer *eb = eb_in; struct btrfs_key found_key; int leave_spinning = path->leave_spinning; + struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; path->leave_spinning = 1; while (1) { - len = btrfs_inode_ref_name_len(eb, iref); - bytes_left -= len; + bytes_left -= name_len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, - (unsigned long)(iref + 1), len); + name_off, name_len); if (eb != eb_in) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1224,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, ret = -ENOENT; if (ret) break; + next_inum = found_key.offset; /* regular exit ahead */ @@ -1239,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); } btrfs_release_path(path); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + parent = next_inum; --bytes_left; if (bytes_left >= 0) @@ -1256,6 +1246,32 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, return dest + bytes_left; } +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_iref_to_path(struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + return ref_to_path(fs_root, path, + btrfs_inode_ref_name_len(eb_in, iref), + (unsigned long)(iref + 1), + eb_in, parent, dest, size); +} + /* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for @@ -1529,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) { int ret = 0; int slot; @@ -1548,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, while (!ret) { path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + &found_key); if (ret < 0) break; if (ret) { @@ -1576,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, "tree %llu\n", cur, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); - ret = iterate(parent, iref, eb, ctx); + ret = iterate(parent, name_len, + (unsigned long)(iref + 1), eb, ctx); if (ret) break; len = sizeof(*iref) + name_len; @@ -1591,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, return ret; } +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = iterate(parent, name_len, + (unsigned long)&extref->name, eb, ctx); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += sizeof(*extref); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, iterate_irefs_t *iterate, + void *ctx) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ -static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx) +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx) { struct inode_fs_paths *ipath = ctx; char *fspath; @@ -1609,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, - inum, fspath_min, bytes_left); + fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, + bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { - pr_debug("missed path, not enough space. missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; @@ -1644,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + inode_to_path, ipath); } struct btrfs_data_container *init_data_container(u32 total_bytes) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0b920c11395..e75533043a5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -33,8 +33,6 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx); int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path); -- cgit v1.2.3 From e8830e606ffee383f073e32313f11fc5692813fe Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Sep 2012 22:14:29 -0600 Subject: Btrfs: fix memory leak in start_transaction() This patch fixes memory leak of the transaction handle which happened when starting transaction failed on a freezed fs. Signed-off-by: Miao Xie --- fs/btrfs/transaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 910ff8051ba..35489644c24 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -351,8 +351,10 @@ again: */ if (type != TRANS_JOIN_NOLOCK && !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) { - if (type == TRANS_JOIN_FREEZE) + if (type == TRANS_JOIN_FREEZE) { + kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(-EPERM); + } sb_start_intwrite(root->fs_info->sb); } -- cgit v1.2.3 From a698d0755adb6f27289d1e6610b2240595d27e8c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Sep 2012 01:51:59 -0600 Subject: Btrfs: add a type field for the transaction handle This patch add a type field into the transaction handle structure, in this way, we needn't implement various end-transaction functions and can make the code more simple and readable. Signed-off-by: Miao Xie --- fs/btrfs/inode.c | 19 ++++--------------- fs/btrfs/transaction.c | 29 ++++++----------------------- fs/btrfs/transaction.h | 15 +++++++++++---- 3 files changed, 21 insertions(+), 42 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 596305e4d75..e355eb0aea1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1363,11 +1363,7 @@ out_check: } error: - if (nolock) { - err = btrfs_end_transaction_nolock(trans, root); - } else { - err = btrfs_end_transaction(trans, root); - } + err = btrfs_end_transaction(trans, root); if (!ret) ret = err; @@ -1957,12 +1953,8 @@ out_unlock: out: if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) { - if (nolock) - btrfs_end_transaction_nolock(trans, root); - else - btrfs_end_transaction(trans, root); - } + if (trans) + btrfs_end_transaction(trans, root); if (ret) clear_extent_uptodate(io_tree, ordered_extent->file_offset, @@ -4524,10 +4516,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - if (nolock) - ret = btrfs_end_transaction_nolock(trans, root); - else - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); } return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 35489644c24..d0a2b7e4938 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -267,14 +267,6 @@ static void wait_current_trans(struct btrfs_root *root) } } -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, - TRANS_JOIN_FREEZE, -}; - static int may_wait_transaction(struct btrfs_root *root, int type) { if (root->fs_info->log_root_recovering) @@ -388,6 +380,7 @@ again: h->aborted = 0; h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; + h->type = type; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -540,11 +533,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle, int lock) + struct btrfs_root *root, int throttle) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; + int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; if (--trans->use_count) { @@ -645,7 +639,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_end_transaction(trans, root, 0, 1); + ret = __btrfs_end_transaction(trans, root, 0); if (ret) return ret; return 0; @@ -656,18 +650,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, { int ret; - ret = __btrfs_end_transaction(trans, root, 1, 1); - if (ret) - return ret; - return 0; -} - -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = __btrfs_end_transaction(trans, root, 0, 0); + ret = __btrfs_end_transaction(trans, root, 1); if (ret) return ret; return 0; @@ -676,7 +659,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 1, 1); + return __btrfs_end_transaction(trans, root, 1); } /* diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fbf8313b9d6..0630bd19396 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,14 @@ struct btrfs_transaction { int aborted; }; +enum btrfs_trans_type { + TRANS_START, + TRANS_JOIN, + TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, + TRANS_JOIN_FREEZE, +}; + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -58,8 +66,9 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; - int aborted; - int adding_csums; + short aborted; + short adding_csums; + enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to * start_transaction is the same as the one passed to end_transaction. @@ -94,8 +103,6 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_start_transaction_noflush( -- cgit v1.2.3 From 354aa0fb6d5b97b262e056f7ad7bfc88d7ce0004 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Sep 2012 01:54:00 -0600 Subject: Btrfs: fix orphan transaction on the freezed filesystem With the following debug patch: static int btrfs_freeze(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_transaction *trans; + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) { + printk("Transid %llu, use_count %d, num_writer %d\n", + trans->transid, atomic_read(&trans->use_count), + atomic_read(&trans->num_writers)); + } + spin_unlock(&fs_info->trans_lock); return 0; } I found there was a orphan transaction after the freeze operation was done. It is because the transaction may not be committed when the transaction handle end even though it is the last handle of the current transaction. This design avoid committing the transaction frequently, but also introduce the above problem. So I add btrfs_attach_transaction() which can catch the current transaction and commit it. If there is no transaction, it will return ENOENT, and do not anything. This function also can be used to instead of btrfs_join_transaction_freeze() because it don't increase the writer counter and don't start a new transaction, so it also can fix the deadlock between sync and freeze. Besides that, it is used to instead of btrfs_join_transaction() in transaction_kthread(), because if there is no transaction, the transaction kthread needn't anything. Signed-off-by: Miao Xie --- fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/super.c | 18 ++++++++++++++---- fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++--------------- fs/btrfs/transaction.h | 4 ++-- 4 files changed, 49 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c18a4eb4d5..dcaf55695e6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1657,9 +1657,10 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - cannot_commit = true; + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; goto sleep; } if (transid == trans->transid) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fb260bbf59c..f8b803f30ed 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -854,10 +854,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_join_transaction_freeze(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - /* Frozen, don't bother */ - if (PTR_ERR(trans) == -EPERM) + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) return 0; return PTR_ERR(trans); } @@ -1511,7 +1511,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - return 0; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + /* no transaction, don't bother */ + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + return btrfs_commit_transaction(trans, root); } static int btrfs_unfreeze(struct super_block *sb) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d0a2b7e4938..69139a356f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int nofail) +static noinline int join_transaction(struct btrfs_root *root, int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -67,7 +67,13 @@ loop: } if (fs_info->trans_no_join) { - if (!nofail) { + /* + * If we are JOIN_NOLOCK we're already committing a current + * transaction, we just need a handle to deal with something + * when committing the transaction, such as inode cache and + * space cache. It is a special case. + */ + if (type != TRANS_JOIN_NOLOCK) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } @@ -87,6 +93,13 @@ loop: } spin_unlock(&fs_info->trans_lock); + /* + * If we are ATTACH, we just want to catch the current transaction, + * and commit it. If there is no transaction, just return ENOENT. + */ + if (type == TRANS_ATTACH) + return -ENOENT; + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -340,27 +353,28 @@ again: * because we're already holding a ref. We need this because we could * have raced in and did an fsync() on a file which can kick a commit * and then we deadlock with somebody doing a freeze. + * + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it, so we needn't do sb_start_intwrite(). */ - if (type != TRANS_JOIN_NOLOCK && - !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) { - if (type == TRANS_JOIN_FREEZE) { - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(-EPERM); - } + if (type < TRANS_JOIN_NOLOCK) sb_start_intwrite(root->fs_info->sb); - } if (may_wait_transaction(root, type)) wait_current_trans(root); do { - ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + ret = join_transaction(root, type); if (ret == -EBUSY) wait_current_trans(root); } while (ret == -EBUSY); if (ret < 0) { - sb_end_intwrite(root->fs_info->sb); + /* We must get the transaction if we are JOIN_NOLOCK. */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -432,9 +446,9 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } -struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root) +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0); + return start_transaction(root, 0, TRANS_ATTACH, 0); } /* wait for a transaction commit to be fully complete */ @@ -605,7 +619,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (lock) + if (trans->type < TRANS_JOIN_NOLOCK) sb_end_intwrite(root->fs_info->sb); WARN_ON(cur_trans != info->running_transaction); @@ -1678,7 +1692,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - sb_end_intwrite(root->fs_info->sb); + if (trans->type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); trace_btrfs_transaction_commit(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0630bd19396..80961947a6b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -52,7 +52,7 @@ enum btrfs_trans_type { TRANS_JOIN, TRANS_USERSPACE, TRANS_JOIN_NOLOCK, - TRANS_JOIN_FREEZE, + TRANS_ATTACH, }; struct btrfs_trans_handle { @@ -109,7 +109,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush( struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 67b0fd63d5466bb710677afd3d2b77926c55f662 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Sep 2012 13:42:00 -0400 Subject: Btrfs: run delayed refs first when out of space Running delayed refs is faster than running delalloc, so lets do that first to try and reclaim space. This makes my fs_mark test about 20% faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 395e222e39a..3270b108785 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3817,10 +3817,10 @@ commit: } enum flush_state { - FLUSH_DELALLOC = 1, - FLUSH_DELALLOC_WAIT = 2, - FLUSH_DELAYED_ITEMS_NR = 3, - FLUSH_DELAYED_ITEMS = 4, + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, ALLOC_CHUNK = 5, COMMIT_TRANS = 6, }; @@ -3834,11 +3834,6 @@ static int flush_space(struct btrfs_root *root, int ret = 0; switch (state) { - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, - state == FLUSH_DELALLOC_WAIT); - break; case FLUSH_DELAYED_ITEMS_NR: case FLUSH_DELAYED_ITEMS: if (state == FLUSH_DELAYED_ITEMS_NR) { @@ -3859,6 +3854,11 @@ static int flush_space(struct btrfs_root *root, ret = btrfs_run_delayed_items_nr(trans, root, nr); btrfs_end_transaction(trans, root); break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3903,7 +3903,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELALLOC; + int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; bool flushing = false; bool committed = false; -- cgit v1.2.3 From 221b831835421f9451182611fa25fa60f440662f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 20 Sep 2012 14:33:00 -0600 Subject: btrfs: fix min csum item size warnings in 32bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7ca4be45a0255ac8f08c05491c6add2dd87dd4f8 limited csum items to PAGE_CACHE_SIZE. It used min() with incompatible types in 32bit which generates warnings: fs/btrfs/file-item.c: In function ‘btrfs_csum_file_blocks’: fs/btrfs/file-item.c:717: warning: comparison of distinct pointer types lacks a cast This uses min_t(u32,) to fix the warnings. u32 seemed reasonable because btrfs_root->leafsize is u32 and PAGE_CACHE_SIZE is unsigned long. Signed-off-by: Zach Brown --- fs/btrfs/file-item.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54f03354bed..1ad08e4e4a1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -29,7 +29,8 @@ sizeof(struct btrfs_item) * 2) / \ size) - 1)) -#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) +#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ + PAGE_CACHE_SIZE)) #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ -- cgit v1.2.3 From de0022b9da616b95ea5b41eab32da825b0b5150f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Sep 2012 14:25:58 -0400 Subject: Btrfs: do not async metadata csumming in certain situations There are a coule scenarios where farming metadata csumming off to an async thread doesn't help. The first is if our processor supports crc32c, in which case the csumming will be fast and so the overhead of the async model is not worth the cost. The other case is for our tree log. We will be making that stuff dirty and writing it out and waiting for it immediately. Even with software crc32c this gives me a ~15% increase in speed with O_SYNC workloads. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 22 ++++++++++++++++++++++ fs/btrfs/extent_io.c | 14 ++++++++++++-- fs/btrfs/extent_io.h | 1 + 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dcaf55695e6..aa02eab8c40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,10 @@ #include "check-integrity.h" #include "rcu-string.h" +#ifdef CONFIG_X86 +#include +#endif + static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -859,10 +863,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); } +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; +} + static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int async = check_async_write(inode, bio_flags); int ret; if (!(rw & REQ_WRITE)) { @@ -877,6 +893,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return ret; return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + return ret; + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2c21570adf..979fa0d6bfe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -45,6 +45,7 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -3163,12 +3164,16 @@ static int write_one_eb(struct extent_buffer *eb, struct block_device *bdev = fs_info->fs_devices->latest_bdev; u64 offset = eb->start; unsigned long i, num_pages; + unsigned long bio_flags = 0; int rw = (epd->sync_io ? WRITE_SYNC : WRITE); int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); @@ -3177,7 +3182,8 @@ static int write_one_eb(struct extent_buffer *eb, ret = submit_extent_page(rw, eb->tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, - 0, 0, 0); + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; if (ret) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); SetPageError(p); @@ -3212,6 +3218,7 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; int ret = 0; int done = 0; @@ -3474,7 +3481,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->sync_io) rw = WRITE_SYNC; - ret = submit_one_bio(rw, epd->bio, 0, 0); + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -3497,6 +3504,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -3521,6 +3529,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -3560,6 +3569,7 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = extent_write_cache_pages(tree, mapping, wbc, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 512f8da041f..a69dea21904 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -27,6 +27,7 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ -- cgit v1.2.3 From ce1953325662fa597197ce728e4195582fc21c8d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Sep 2012 15:26:16 -0400 Subject: Btrfs: do not hold the file extent leaf locked when adding extent item For some reason we unlock everything except the leaf we are on, set the path blocking and then add the extent item for the extent we just finished writing. I can't for the life of me figure out why we would want to do this, and the history doesn't really indicate that there was a real reason for it, so just remove it. This will reduce our tree lock contention on heavy writes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e355eb0aea1..d24b6501461 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1822,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, fi, encryption); btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); inode_add_bytes(inode, num_bytes); -- cgit v1.2.3 From e6138876ad8327250d77291b3262fee356267211 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 27 Sep 2012 17:07:30 -0400 Subject: Btrfs: cache extent state when writing out dirty metadata pages Everytime we write out dirty pages we search for an offset in the tree, convert the bits in the state, and then when we wait we search for the offset again and clear the bits. So for every dirty range in the io tree we are doing 4 rb searches, which is suboptimal. With this patch we are only doing 2 searches for every cycle (modulo weird things happening). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/extent_io.c | 43 +++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/extent_io.h | 6 ++++-- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/transaction.c | 14 +++++++++----- fs/btrfs/tree-log.c | 3 ++- 8 files changed, 63 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa02eab8c40..c69995556f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3572,7 +3572,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); + mark, NULL); if (ret) break; @@ -3627,7 +3627,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3270b108785..ca4aad96f81 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, while (start < end) { ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); if (ret) break; @@ -5045,7 +5046,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 979fa0d6bfe..e8ee39b7335 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -937,6 +937,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * @end: the end offset in bytes (inclusive) * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist @@ -946,7 +947,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask) + int bits, int clear_bits, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -963,6 +965,15 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* * this search will find all the extents that end after * our range starts. @@ -993,6 +1004,7 @@ hit_next: */ if (state->start == start && state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1033,6 +1045,7 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1071,6 +1084,7 @@ hit_next: &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1093,6 +1107,7 @@ hit_next: extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); clear_state_bit(tree, prealloc, &clear_bits, 0); prealloc = NULL; goto out; @@ -1297,18 +1312,42 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) + u64 *start_ret, u64 *end_ret, int bits, + struct extent_state **cached_state) { struct extent_state *state; + struct rb_node *n; int ret = 1; spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && state->tree) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + state = find_first_extent_bit_state(tree, start, bits); +got_it: if (state) { + cache_state(state, cached_state); *start_ret = state->start; *end_ret = state->end; ret = 0; } +out: spin_unlock(&tree->lock); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a69dea21904..7aeb31087f8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -233,13 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask); + int bits, int clear_bits, + struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits); + u64 *start_ret, u64 *end_ret, int bits, + struct extent_state **cached_state); struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, int bits); int extent_invalidatepage(struct extent_io_tree *tree, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b107e68797f..1027b854b90 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, block_group->key.offset)) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) { ret = 0; break; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6e530bb86c9..776f0aa128f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3621,7 +3621,7 @@ next: ret = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret == 0 && start <= key.objectid) { btrfs_release_path(path); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 69139a356f7..77db875b511 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -687,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int err = 0; int werr = 0; struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, - GFP_NOFS); + mark, &cached_state)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + cached_state = NULL; err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; @@ -717,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int err = 0; int werr = 0; struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT)) { - clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + EXTENT_NEED_WAIT, &cached_state)) { + clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1d7b3484432..f4b9e54b1da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2463,7 +2463,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + NULL); if (ret) break; -- cgit v1.2.3 From 6f1fed775316d58ba356b2ce62de600ad00f003a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Sep 2012 11:07:06 -0400 Subject: Btrfs: don't lookup csums for prealloc extents The tree logging stuff was looking up csums to copy over for prealloc extents which is just work we don't need to be doing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4b9e54b1da..31b46a9e94c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3011,8 +3011,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, continue; found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (found_type == BTRFS_FILE_EXTENT_REG) { u64 ds, dl, cs, cl; ds = btrfs_file_extent_disk_bytenr(src, extent); -- cgit v1.2.3 From 18ec90d63f43cf785f6e73e7a7db546bff3fa380 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2012 11:56:28 -0400 Subject: Btrfs: be smarter about dropping things from the tree log When we truncate existing items in the tree log we've been searching for each individual item and removing them. This is unnecessary churn and searching, just keep track of the slot we are on and how many items we need to delete and delete them all at once. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 31b46a9e94c..179fda96460 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2901,6 +2901,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key key; struct btrfs_key found_key; + int start_slot; key.objectid = objectid; key.type = max_key_type; @@ -2922,8 +2923,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, if (found_key.objectid != objectid) break; - ret = btrfs_del_item(trans, log, path); - if (ret) + found_key.offset = 0; + found_key.type = 0; + ret = btrfs_bin_search(path->nodes[0], &found_key, 0, + &start_slot); + + ret = btrfs_del_items(trans, log, path, start_slot, + path->slots[0] - start_slot + 1); + /* + * If start slot isn't 0 then we don't need to re-search, we've + * found the last guy with the objectid in this tree. + */ + if (ret || start_slot != 0) break; btrfs_release_path(path); } -- cgit v1.2.3 From f0bd95ea72644cbf9210606f6f8edfce13d65d47 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 1 Oct 2012 03:08:37 -0600 Subject: Btrfs: confirmation of value is added before trace_btrfs_get_extent() is called We should confirm the value of extent_map before calling trace_btrfs_get_extent() because the value of extent_map has the possibility of NULL. Signed-off-by: Tsutomu Itoh --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d24b6501461..f92def2467a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5505,7 +5505,8 @@ insert: write_unlock(&em_tree->lock); out: - trace_btrfs_get_extent(root, em); + if (em) + trace_btrfs_get_extent(root, em); if (path) btrfs_free_path(path); -- cgit v1.2.3 From 44734ed1ca0c2e66479cedbf773627a9b5a31364 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2012 16:04:19 -0400 Subject: Btrfs: don't commit instead of overcommitting I don't think we have the same problem that this was supposed to fix originally since we can allocate chunks in the enospc path now. This code is causing us to constantly commit the transaction as we get close to using all of our available space in our currently allocated chunks, instead of allocating another chunk and carrying on with life, which is not nice for performance. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ca4aad96f81..3d3e2c17d8d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3907,7 +3907,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; bool flushing = false; - bool committed = false; again: ret = 0; @@ -3970,33 +3969,12 @@ again: (orig_bytes * 2); } - if (ret) { - u64 avail; - - /* - * If we have a lot of space that's pinned, don't bother doing - * the overcommit dance yet and just commit the transaction. - */ - avail = (space_info->total_bytes - space_info->bytes_used) * 8; - do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !committed) { - space_info->flush = 1; - flushing = true; - spin_unlock(&space_info->lock); - ret = may_commit_transaction(root, space_info, - orig_bytes, 1); - if (ret) - goto out; - committed = true; - goto again; - } - - if (can_overcommit(root, space_info, orig_bytes, flush)) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; } /* -- cgit v1.2.3 From 8d1a1317af439cc7f349907b79198d35feffe179 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 29 Sep 2012 02:07:46 -0600 Subject: btrfs: remove unused function btrfs_insert_some_items() The function btrfs_insert_some_items() would not be called by any other functions, so remove it. Signed-off-by: Robin Dong --- fs/btrfs/ctree.c | 143 ------------------------------------------------------- 1 file changed, 143 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fcc8c21a623..b3343621100 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4401,149 +4401,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, } } -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. - */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int ret = 0; - int slot; - int i; - u32 nritems; - u32 total_data = 0; - u32 total_size = 0; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct btrfs_key found_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - for (i = 0; i < nr; i++) { - if (total_size + data_size[i] + sizeof(struct btrfs_item) > - BTRFS_LEAF_DATA_SIZE(root)) { - break; - nr = i; - } - total_data += data_size[i]; - total_size += data_size[i] + sizeof(struct btrfs_item); - } - BUG_ON(nr == 0); - - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - for (i = nr; i >= 0; i--) { - total_data -= data_size[i]; - total_size -= data_size[i] + sizeof(struct btrfs_item); - if (total_size < btrfs_leaf_free_space(root, leaf)) - break; - } - nr = i; - } - - slot = path->slots[0]; - BUG_ON(slot < 0); - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* figure out how many keys we can insert in here */ - total_data = data_size[0]; - for (i = 1; i < nr; i++) { - if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) - break; - total_data += data_size[i]; - } - nr = i; - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); - } - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } else { - /* - * this sucks but it has to be done, if we are inserting at - * the end of the leaf only insert 1 of the items, since we - * have no way of knowing whats on the next leaf and we'd have - * to drop our current locks to figure it out - */ - nr = 1; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); - } - btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -out: - if (!ret) - ret = nr; - return ret; -} - /* * this is a helper for btrfs_insert_empty_items, the main goal here is * to save stack depth by doing the bulk of the work in a function -- cgit v1.2.3 From 7a2d6a64645b38d7040bbd031c7a7b2655f5d976 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 1 Oct 2012 03:07:15 -0600 Subject: Btrfs: remove unnecessary IS_ERR in bio_readpage_error() Because the value of extent_map is only a correct value or NULL, so IS_ERR is unnecessary. Signed-off-by: Tsutomu Itoh --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e8ee39b7335..67fe401c320 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2110,7 +2110,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } read_unlock(&em_tree->lock); - if (!em || IS_ERR(em)) { + if (!em) { kfree(failrec); return -EIO; } -- cgit v1.2.3 From 479ed9abdbeec5d9ed0005f3bee9c9bc06a102bb Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 29 Sep 2012 02:07:47 -0600 Subject: btrfs: move inline function code to header file When building btrfs from kernel code, it will report: fs/btrfs/extent_io.h:281: warning: 'extent_buffer_page' declared inline after being called fs/btrfs/extent_io.h:281: warning: previous declaration of 'extent_buffer_page' was here fs/btrfs/extent_io.h:280: warning: 'num_extent_pages' declared inline after being called fs/btrfs/extent_io.h:280: warning: previous declaration of 'num_extent_pages' was here because of the wrong declaration of inline functions. Signed-off-by: Robin Dong --- fs/btrfs/extent_io.c | 12 ------------ fs/btrfs/extent_io.h | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 67fe401c320..b82d244a2ef 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3986,18 +3986,6 @@ out: return ret; } -inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - -inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - static void __free_extent_buffer(struct extent_buffer *eb) { #if LEAK_DEBUG diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7aeb31087f8..711d12b8002 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -282,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); -unsigned long num_extent_pages(u64 start, u64 len); -struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + return eb->pages[i]; +} static inline void extent_buffer_get(struct extent_buffer *eb) { -- cgit v1.2.3 From 94edf4ae43a5f9405fe3570f670e26ce1b188476 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Sep 2012 14:56:25 -0400 Subject: Btrfs: don't bother committing delayed inode updates when fsyncing We can just copy the in memory inode into the tree log directly, no sense in updating the fs tree so we can copy it into the tree log tree. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 84 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 179fda96460..47911fd1831 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2944,6 +2944,55 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode, int log_inode_only) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + + btrfs_set_inode_sequence(leaf, item, inode->i_version); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, 0); + + if (log_inode_only) { + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, 0); + } else { + btrfs_set_inode_generation(leaf, item, + BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); + } + +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -2990,24 +3039,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); - - if (inode_only == LOG_INODE_EXISTS && - ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], struct btrfs_inode_item); - btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); - - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' - */ - btrfs_set_inode_generation(dst_path->nodes[0], - inode_item, 0); + fill_inode_item(trans, dst_path->nodes[0], inode_item, + inode, inode_only == LOG_INODE_EXISTS); + } else { + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); } + /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -3361,11 +3403,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + /* Only run delayed items if we are a dir or a new file */ + if (S_ISDIR(inode->i_mode) || + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; + } } mutex_lock(&BTRFS_I(inode)->log_mutex); -- cgit v1.2.3 From 489406626c42c0176fddae2182d33be2cfb9840c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 7 May 2012 06:35:38 -0600 Subject: btrfs: fix message printing Fix various messages to include newline and module prefix. Signed-off-by: Daniel J Blueman --- fs/btrfs/super.c | 8 ++++---- fs/btrfs/volumes.c | 6 +++--- fs/btrfs/zlib.c | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f8b803f30ed..0fadcdba8ce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -243,7 +243,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); + WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n"); trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -549,11 +549,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag"); + printk(KERN_INFO "btrfs: enabling auto defrag\n"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery"); + printk(KERN_INFO "btrfs: enabling auto recovery\n"); btrfs_set_opt(info->mount_opt, RECOVERY); break; case Opt_skip_balance: @@ -1602,7 +1602,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "misc_deregister failed for control device"); + printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4f4de51b5bd..dfe5e3a22f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, bdev = blkdev_get_by_path(device->name->str, flags, holder); if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name->str); + printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); goto error; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -3769,7 +3769,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_CRIT "unable to find logical %llu len %llu\n", + printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n", (unsigned long long)logical, (unsigned long long)*length); BUG(); @@ -4226,7 +4226,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, total_devs = bbio->num_stripes; if (map_length < length) { - printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, (unsigned long long)length, (unsigned long long)map_length); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 92c20654cc5..9acb846c3e7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_in = 0; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "deflateInit failed\n"); + printk(KERN_WARNING "btrfs: deflateInit failed\n"); ret = -1; goto out; } @@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws, while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); ret = -1; @@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); + printk(KERN_WARNING "btrfs: inflateInit failed\n"); return -1; } while (workspace->inf_strm.total_in < srclen) { @@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, } if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); + printk(KERN_WARNING "btrfs: inflateInit failed\n"); return -1; } -- cgit v1.2.3 From bedb2cca7252d08c6ca3085826e30f65bdc3b54b Mon Sep 17 00:00:00 2001 From: Andrei Popa Date: Thu, 20 Sep 2012 08:42:11 -0600 Subject: Btrfs: make compress and nodatacow mount options mutually exclusive If a filesystem is mounted with compression and then remounted by adding nodatacow, the compression is disabled but the compress flag is still visible. Also, if a filesystem is mounted with nodatacow and then remounted with compression, nodatacow flag is still present but it's not active. This patch: - removes compress flags and notifies that the compression has been disabled if the filesystem is mounted with nodatacow - removes nodatacow and nodatasum flags if mounted with compress. Signed-off-by: Andrei Popa --- fs/btrfs/super.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0fadcdba8ce..915ac14c206 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -413,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - printk(KERN_INFO "btrfs: setting nodatacow\n"); + if (!btrfs_test_opt(root, COMPRESS) || + !btrfs_test_opt(root, FORCE_COMPRESS)) { + printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); + } else { + printk(KERN_INFO "btrfs: setting nodatacow\n"); + } + info->compress_type = BTRFS_COMPRESS_NONE; + btrfs_clear_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; @@ -428,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; -- cgit v1.2.3 From 62856a9b73860cffe2a3d91b069393b88c219aa6 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 31 Jul 2012 11:09:44 -0600 Subject: Btrfs: detect corrupted filesystem after write I/O errors In check-integrity, detect when a superblock is written that points to blocks that have not been written to disk due to I/O write errors. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9197e2e3340..5a3e45db642 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -37,8 +37,9 @@ * the file system was mounted, (i.e., they have been * referenced by the super block) or they have been * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. * 2b. All referenced blocks need to have a generation * number which is equal to the parent's number. * @@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, (unsigned long long)l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; } else if (l->parent_generation != l->block_ref_to->generation && BTRFSIC_GENERATION_UNKNOWN != -- cgit v1.2.3 From 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 1 Aug 2012 18:56:49 +0200 Subject: Btrfs: make filesystem read-only when submitting barrier fails So far the return code of barrier_all_devices() is ignored, which means that errors are ignored. The result can be a corrupt filesystem which is not consistent. This commit adds code to evaluate the return code of barrier_all_devices(). The normal btrfs_error() mechanism is used to switch the filesystem into read-only mode when errors are detected. In order to decide whether barrier_all_devices() should return error or success, the number of disks that are allowed to fail the barrier submission is calculated. This calculation accounts for the worst RAID level of metadata, system and data. If single, dup or RAID0 is in use, a single disk error is already considered to be fatal. Otherwise a single disk error is tolerated. The calculation of the number of disks that are tolerated to fail the barrier operation is performed when the filesystem gets mounted, when a balance operation is started and finished, and when devices are added or removed. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 5 +++ fs/btrfs/disk-io.c | 109 +++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/disk-io.h | 2 + fs/btrfs/ioctl.c | 8 ++-- fs/btrfs/tree-log.c | 7 +++- fs/btrfs/volumes.c | 30 +++++++++++++++ 6 files changed, 142 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50dcd0fbae1..1630be83121 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1468,6 +1468,8 @@ struct btrfs_fs_info { /* next backup root to be overwritten */ int backup_root_index; + + int num_tolerated_disk_barrier_failures; }; /* @@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); + /* file.c */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c69995556f6..83552368770 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2505,6 +2505,8 @@ retry_root_backup: printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); @@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) printk_in_rcu("btrfs: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; - } - if (!bio_flagged(bio, BIO_UPTODATE)) { + } else if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; - if (!bio_flagged(bio, BIO_EOPNOTSUPP)) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ @@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors = 0; + int errors_send = 0; + int errors_wait = 0; int ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_send++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 0); if (ret) - errors++; + errors_send++; } /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_wait++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 1); if (ret) - errors++; + errors_wait++; } - if (errors) + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) return -EIO; return 0; } +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. + */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 + && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - if (do_barriers) - barrier_all_devices(root->fs_info); + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c5b00a735fe..2025a9132c1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d6836af6d60..f5a2e6c4320 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return 0; } -static void get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space) +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) { struct btrfs_block_group_cache *block_group; @@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { - get_block_group_info(&info->block_groups[c], - &space); + btrfs_get_block_group_info( + &info->block_groups[c], &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 47911fd1831..67eab2d4d8a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * in and cause problems either. */ btrfs_scrub_pause_super(root); - write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = write_ctree_super(trans, root->fs_info->tree_root, 1); btrfs_scrub_continue_super(root); - ret = 0; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_wake_log_root; + } mutex_lock(&root->log_mutex); if (root->last_log_commit < log_transid) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dfe5e3a22f5..029b903a4ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) free_fs_devices(cur_devices); } + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + /* * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super @@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); + root->fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); ret = btrfs_commit_transaction(trans, root); if (seeding_dev) { @@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + int num_tolerated_disk_barrier_failures; + u64 target = bctl->sys.target; + + num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (num_tolerated_disk_barrier_failures > 0 && + (target & + (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_AVAIL_ALLOC_BIT_SINGLE))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 && + (target & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + + fs_info->num_tolerated_disk_barrier_failures = + num_tolerated_disk_barrier_failures; + } + ret = insert_balance_item(fs_info->tree_root, bctl); if (ret && ret != -EEXIST) goto out; @@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, __cancel_balance(fs_info); } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + } + wake_up(&fs_info->balance_wait_q); return ret; -- cgit v1.2.3 From 15e3004a0eb2c4879bc666d0e6a3acd973fb226c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Oct 2012 13:39:50 -0400 Subject: Btrfs: cleanup pages properly when ENOMEM in compression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were freeing non-existent pages which was causing a panic for a user who was suffering from ENOMEM. This patch fixes the problem. Thanks, Reported-by: Jérôme Poulin Signed-off-by: Josef Bacik --- fs/btrfs/compression.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 43d1c5a3a03..c6467aa88be 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_start; struct extent_map *em; int ret = -ENOMEM; + int faili = 0; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!cb->compressed_pages[pg_index]) + if (!cb->compressed_pages[pg_index]) { + faili = pg_index - 1; + ret = -ENOMEM; goto fail2; + } } + faili = nr_pages - 1; cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, return 0; fail2: - for (pg_index = 0; pg_index < nr_pages; pg_index++) - free_page((unsigned long)cb->compressed_pages[pg_index]); + while (faili >= 0) { + __free_page(cb->compressed_pages[faili]); + faili--; + } kfree(cb->compressed_pages); fail1: -- cgit v1.2.3 From edd33c99c4ba26ebe17c1a3d65b4aba25482ed32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Oct 2012 16:40:32 -0400 Subject: Btrfs: don't bug on enomem in readpage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Get rid of the BUG_ON(ret == -ENOMEM) in __extent_read_full_page. Thanks, Reported-by: Jérôme Poulin Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b82d244a2ef..8c37cb64be7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2751,12 +2751,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - BUG_ON(ret == -ENOMEM); - nr++; - *bio_flags = this_bio_flag; + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } } - if (ret) + if (ret) { SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1); + } cur = cur + iosize; pg_offset += iosize; } -- cgit v1.2.3 From 4804b38293c020e7a2c841e86402f456c19d934d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Oct 2012 16:43:45 -0400 Subject: Btrfs: do not warn_on when we cannot alloc a page for an extent buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's just annoying and the user will have gotten a nice OOM killer message so they are already fully aware they are screwed :). Thanks, Reported-by: Jérôme Poulin Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8c37cb64be7..7dc69b38548 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4249,10 +4249,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); - if (!p) { - WARN_ON(1); + if (!p) goto free_eb; - } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { -- cgit v1.2.3 From f60b1b49f6f72abb8bedfd49b758773bbda043c8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Oct 2012 16:53:34 -0400 Subject: Btrfs: fix page leakage Alloc_dummy_extent_buffer will not free the first page in the eb array if we fail to allocate a page, fix this. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7dc69b38548..64dc93f64bc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4104,7 +4104,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) return eb; err: - for (i--; i > 0; i--) + for (i--; i >= 0; i--) __free_page(eb->pages[i]); __free_extent_buffer(eb); return NULL; -- cgit v1.2.3 From 1037a5affc266fdfe08b2dd415d309ab2778ce6a Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Mon, 8 Oct 2012 07:26:15 -0600 Subject: Btrfs: remove repeated eb->pages check in, disk-io.c/csum_dirty_buffer In csum_dirty_buffer, we first get eb from page->private. Then we check if the page is the first page of eb. Later we check it again. Remove the repeated check here. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/disk-io.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 83552368770..7cda51995c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -433,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); return 0; } - if (eb->pages[0] != page) { - WARN_ON(1); - return 0; - } if (!PageUptodate(page)) { WARN_ON(1); return 0; -- cgit v1.2.3 From f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Oct 2012 11:17:20 -0400 Subject: btrfs: init ref_index to zero in add_inode_ref Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 67eab2d4d8a..e9ebb472b28 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1033,7 +1033,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; - u64 ref_index; + u64 ref_index = 0; int ref_struct_size; ref_ptr = btrfs_item_ptr_offset(eb, slot); -- cgit v1.2.3 From c43a25abba97c7d87131e71db6be24b24d7791a5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:21 -0400 Subject: audit: reverse arguments to audit_inode_child Most of the callers get called with an inode and dentry in the reverse order. The compiler then has to reshuffle the arg registers and/or stack in order to pass them on to audit_inode_child. Reverse those arguments for a micro-optimization. Reported-by: Eric Paris Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e568c472f80..49f4d59ac2c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -638,7 +638,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) return -ENOENT; BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim, dir); + audit_inode_child(dir, victim); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) -- cgit v1.2.3 From 4fa6b5ecbf092c6ee752ece8a55d71f663d23254 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 15:25:25 -0400 Subject: audit: overhaul __audit_inode_child to accomodate retrying In order to accomodate retrying path-based syscalls, we need to add a new "type" argument to audit_inode_child. This will tell us whether we're looking for a child entry that represents a create or a delete. If we find a parent, don't automatically assume that we need to create a new entry. Instead, use the information we have to try to find an existing entry first. Update it if one is found and create a new one if not. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 49f4d59ac2c..61168805f17 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -638,7 +638,7 @@ static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) return -ENOENT; BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(dir, victim); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) -- cgit v1.2.3 From e9069f470803eeb5e243a05bc717452c6218bd71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 12 Oct 2012 14:25:24 -0700 Subject: btrfs: Fix compilation with user namespace support enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling with user namespace support btrfs fails like: fs/btrfs/tree-log.c: In function ‘fill_inode_item’: fs/btrfs/tree-log.c:2955:2: error: incompatible type for argument 3 of ‘btrfs_set_inode_uid’ fs/btrfs/ctree.h:2026:1: note: expected ‘u32’ but argument is of type ‘kuid_t’ fs/btrfs/tree-log.c:2956:2: error: incompatible type for argument 3 of ‘btrfs_set_inode_gid’ fs/btrfs/ctree.h:2027:1: note: expected ‘u32’ but argument is of type ‘kgid_t’ Fix this by using i_uid_read and i_gid_read in Cc: Chris Mason Cc: Josef Bacik Signed-off-by: "Eric W. Biederman" --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e9ebb472b28..81e407d9677 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,8 +2952,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); btrfs_set_inode_nlink(leaf, item, inode->i_nlink); -- cgit v1.2.3 From 57911b8ba814fae01306376a0d02bc7cdc88dc94 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 19 Oct 2012 09:22:03 +0200 Subject: Btrfs: don't put removals from push_node_left into tree mod log twice Independant of the check (push_items < src_items) tree_mod_log_eb_copy did log the removal of the old data entries from the source buffer. Therefore, we must not call tree_mod_log_eb_move if the check evaluates to true, as that would log the removal twice, finally resulting in (rewinded) buffers with wrong values for header_nritems. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b3343621100..44a7e25353a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1225,6 +1225,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, free_extent_buffer(eb); __tree_mod_log_rewind(eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); return eb_rewin; } @@ -1280,6 +1282,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) else WARN_ON(btrfs_header_level(eb) != 0); extent_buffer_get(eb); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); return eb; } @@ -2970,8 +2973,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { - tree_mod_log_eb_move(root->fs_info, src, 0, push_items, - src_nritems - push_items); + /* + * don't call tree_mod_log_eb_move here, key removal was already + * fully logged by tree_mod_log_eb_copy above. + */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * -- cgit v1.2.3 From ba1bfbd592c1adddd5d0005f587a6e308e25c949 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 22 Oct 2012 20:02:56 +0200 Subject: Btrfs: fix a tree mod logging issue for root replacement operations Avoid the implicit free by tree_mod_log_set_root_pointer, which is wrong in two places. Where needed, we call tree_mod_log_free_eb explicitly now. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 44a7e25353a..e6b75ccd285 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -647,8 +647,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; - __tree_mod_log_free_eb(fs_info, old_root); - ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -926,12 +924,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - /* - * don't log freeing in case we're freeing the root node, this - * is done by tree_mod_log_set_root_pointer later - */ - if (buf != root->node && btrfs_header_level(buf) != 0) - tree_mod_log_free_eb(root->fs_info, buf); + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1728,6 +1721,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); -- cgit v1.2.3 From 834328a8493079d15f30866ace42489463f52571 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 11:27:33 +0200 Subject: Btrfs: tree mod log's old roots could still be part of the tree Tree mod log treated old root buffers as always empty buffers when starting the rewind operations. However, the old root may still be part of the current tree at a lower level, with still some valid entries. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6b75ccd285..d9308c38a8f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1239,6 +1239,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; + u32 blocksize; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); @@ -1254,12 +1255,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } tm = tree_mod_log_search(root->fs_info, logical, time_seq); - if (old_root) + if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + blocksize = btrfs_level_size(root, old_root->level); + eb = read_tree_block(root, logical, blocksize, 0); + if (!eb) { + pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", + logical); + WARN_ON(1); + } else { + eb = btrfs_clone_extent_buffer(eb); + } + } else if (old_root) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); eb = alloc_dummy_extent_buffer(logical, root->nodesize); - else + } else { eb = btrfs_clone_extent_buffer(root->node); - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + } + if (!eb) return NULL; btrfs_tree_read_lock(eb); -- cgit v1.2.3 From 5b6602e762cae17c8891d19698afea451e9c1d95 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 11:28:27 +0200 Subject: Btrfs: determine level of old roots In btrfs_find_all_roots' termination condition, we compare the level of the old buffer we got from btrfs_search_old_slot to the level of the current root node. We'd better compare it to the level of the rewinded root node. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 4 +--- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f3187938e08..65608fbf223 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -283,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d9308c38a8f..ef68c33eefd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1297,6 +1297,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq) return eb; } +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { + level = tm->old_root.level; + } else { + rcu_read_lock(); + level = btrfs_header_level(root->node); + rcu_read_unlock(); + } + + return level; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1630be83121..34c5a442dd3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3120,6 +3120,7 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic_inc_return(&fs_info->tree_mod_seq); } +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, -- cgit v1.2.3 From d638108484d186bf97a838d037f165d9b404e6ee Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 14:21:05 +0200 Subject: Btrfs: fix extent buffer reference for tree mod log roots In get_old_root we grab a lock on the extent buffer before we obtain a reference on that buffer. That order is changed now. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ef68c33eefd..f6739903d07 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1279,6 +1279,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; + extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1291,7 +1292,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) __tree_mod_log_rewind(eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); - extent_buffer_get(eb); WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); return eb; -- cgit v1.2.3 From 01763a2e37425ae3f37a3dc051e0703fdade5956 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 23 Oct 2012 15:02:12 +0200 Subject: Btrfs: comment for loop in tree_mod_log_insert_move Emphasis the way tree_mod_log_insert_move avoids adding MOD_LOG_KEY_REMOVE_WHILE_MOVING operations, depending on the direction of the move operation. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6739903d07..eba44b07682 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -596,6 +596,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, eb)) return 0; + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, MOD_LOG_KEY_REMOVE_WHILE_MOVING); -- cgit v1.2.3 From 661bec6ba884b86517ef5ea529aabb281a7198d9 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Wed, 10 Oct 2012 08:50:47 -0600 Subject: Fix a sign bug causing invalid memory access in the ino_paths ioctl. To see the problem, create many hardlinks to the same file (120 should do it), then look up paths by inode with: ls -i btrfs inspect inode-resolve -v $ino /mnt/btrfs I noticed the memory layout of the fspath->val data had some irregularities (some unnecessary gaps that stop appearing about halfway), so I'm not sure there aren't any bugs left in it. --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f3187938e08..2bcbea3f630 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1186,7 +1186,7 @@ static char *ref_to_path(struct btrfs_root *fs_root, int slot; u64 next_inum; int ret; - s64 bytes_left = size - 1; + s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; int leave_spinning = path->leave_spinning; -- cgit v1.2.3 From 84167d190569eedcdb24bf2499bdda437e442962 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 11 Oct 2012 07:25:16 -0600 Subject: Btrfs: Fix wrong error handling code gcc says "warning: comparison of unsigned expression >= 0 is always true" because i is an unsigned long. And gcc is right this time. Signed-off-by: Stefan Behrens --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 64dc93f64bc..a32ebfeb91c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4104,8 +4104,8 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) return eb; err: - for (i--; i >= 0; i--) - __free_page(eb->pages[i]); + for (; i > 0; i--) + __free_page(eb->pages[i - 1]); __free_extent_buffer(eb); return NULL; } -- cgit v1.2.3 From 96b5bd777118bb673b458b41bbefc7f0f31d65c9 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 15 Oct 2012 08:30:45 +0000 Subject: Btrfs: extended inode refs support for send mechanism This adds support for the new extended inode refs to btrfs send. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 22 +++++----- fs/btrfs/backref.h | 4 ++ fs/btrfs/send.c | 126 ++++++++++++++++++++++++++++++++++------------------- 3 files changed, 94 insertions(+), 58 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2bcbea3f630..b8b69266393 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1177,11 +1177,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, return ret; } -static char *ref_to_path(struct btrfs_root *fs_root, - struct btrfs_path *path, - u32 name_len, unsigned long name_off, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) { int slot; u64 next_inum; @@ -1266,10 +1265,10 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct extent_buffer *eb_in, u64 parent, char *dest, u32 size) { - return ref_to_path(fs_root, path, - btrfs_inode_ref_name_len(eb_in, iref), - (unsigned long)(iref + 1), - eb_in, parent, dest, size); + return btrfs_ref_to_path(fs_root, path, + btrfs_inode_ref_name_len(eb_in, iref), + (unsigned long)(iref + 1), + eb_in, parent, dest, size); } /* @@ -1715,9 +1714,8 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, - name_off, eb, inum, fspath_min, - bytes_left); + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e75533043a5..d61feca7945 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -62,6 +62,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_inode_ref *iref, struct extent_buffer *eb, u64 parent, char *dest, u32 size); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c7beb543a4a..4f20c49edd5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -745,31 +745,36 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, void *ctx); /* - * Helper function to iterate the entries in ONE btrfs_inode_ref. + * Helper function to iterate the entries in ONE btrfs_inode_ref or + * btrfs_inode_extref. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * - * path must point to the INODE_REF when called. + * path must point to the INODE_REF or INODE_EXTREF when called. */ static int iterate_inode_ref(struct send_ctx *sctx, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { - struct extent_buffer *eb; + struct extent_buffer *eb = path->nodes[0]; struct btrfs_item *item; struct btrfs_inode_ref *iref; + struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; struct fs_path *p; - u32 cur; - u32 len; + u32 cur = 0; u32 total; - int slot; + int slot = path->slots[0]; u32 name_len; char *start; int ret = 0; - int num; + int num = 0; int index; + u64 dir; + unsigned long name_off; + unsigned long elem_size; + unsigned long ptr; p = fs_path_alloc_reversed(sctx); if (!p) @@ -781,24 +786,40 @@ static int iterate_inode_ref(struct send_ctx *sctx, return -ENOMEM; } - eb = path->nodes[0]; - slot = path->slots[0]; - item = btrfs_item_nr(eb, slot); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - cur = 0; - len = 0; - total = btrfs_item_size(eb, item); - num = 0; + if (found_key->type == BTRFS_INODE_REF_KEY) { + ptr = (unsigned long)btrfs_item_ptr(eb, slot, + struct btrfs_inode_ref); + item = btrfs_item_nr(eb, slot); + total = btrfs_item_size(eb, item); + elem_size = sizeof(*iref); + } else { + ptr = btrfs_item_ptr_offset(eb, slot); + total = btrfs_item_size_nr(eb, slot); + elem_size = sizeof(*extref); + } + while (cur < total) { fs_path_reset(p); - name_len = btrfs_inode_ref_name_len(eb, iref); - index = btrfs_inode_ref_index(eb, iref); + if (found_key->type == BTRFS_INODE_REF_KEY) { + iref = (struct btrfs_inode_ref *)(ptr + cur); + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + index = btrfs_inode_ref_index(eb, iref); + dir = found_key->offset; + } else { + extref = (struct btrfs_inode_extref *)(ptr + cur); + name_len = btrfs_inode_extref_name_len(eb, extref); + name_off = (unsigned long)&extref->name; + index = btrfs_inode_extref_index(eb, extref); + dir = btrfs_inode_extref_parent(eb, extref); + } + if (resolve) { - start = btrfs_iref_to_path(root, tmp_path, iref, eb, - found_key->offset, p->buf, - p->buf_len); + start = btrfs_ref_to_path(root, tmp_path, name_len, + name_off, eb, dir, + p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; @@ -809,9 +830,10 @@ static int iterate_inode_ref(struct send_ctx *sctx, p->buf_len + p->buf - start); if (ret < 0) goto out; - start = btrfs_iref_to_path(root, tmp_path, iref, - eb, found_key->offset, p->buf, - p->buf_len); + start = btrfs_ref_to_path(root, tmp_path, + name_len, name_off, + eb, dir, + p->buf, p->buf_len); if (IS_ERR(start)) { ret = PTR_ERR(start); goto out; @@ -820,21 +842,16 @@ static int iterate_inode_ref(struct send_ctx *sctx, } p->start = start; } else { - ret = fs_path_add_from_extent_buffer(p, eb, - (unsigned long)(iref + 1), name_len); + ret = fs_path_add_from_extent_buffer(p, eb, name_off, + name_len); if (ret < 0) goto out; } - - len = sizeof(*iref) + name_len; - iref = (struct btrfs_inode_ref *)((char *)iref + len); - cur += len; - - ret = iterate(num, found_key->offset, index, p, ctx); + cur += elem_size + name_len; + ret = iterate(num, dir, index, p, ctx); if (ret) goto out; - num++; } @@ -998,7 +1015,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, } btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); if (found_key.objectid != ino || - found_key.type != BTRFS_INODE_REF_KEY) { + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { ret = -ENOENT; goto out; } @@ -1551,8 +1569,8 @@ static int get_first_ref(struct send_ctx *sctx, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; - struct btrfs_inode_ref *iref; int len; + u64 parent_dir; path = alloc_path_for_send(); if (!path) @@ -1568,27 +1586,41 @@ static int get_first_ref(struct send_ctx *sctx, if (!ret) btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + if (ret || found_key.objectid != ino || + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) { ret = -ENOENT; goto out; } - iref = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_ref); - len = btrfs_inode_ref_name_len(path->nodes[0], iref); - ret = fs_path_add_from_extent_buffer(name, path->nodes[0], - (unsigned long)(iref + 1), len); + if (key.type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + iref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], iref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)(iref + 1), + len); + parent_dir = found_key.offset; + } else { + struct btrfs_inode_extref *extref; + extref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_extref); + len = btrfs_inode_extref_name_len(path->nodes[0], extref); + ret = fs_path_add_from_extent_buffer(name, path->nodes[0], + (unsigned long)&extref->name, len); + parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); + } if (ret < 0) goto out; btrfs_release_path(path); - ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL, + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; - *dir = found_key.offset; + *dir = parent_dir; out: btrfs_free_path(path); @@ -3226,7 +3258,8 @@ static int process_all_refs(struct send_ctx *sctx, btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || - found_key.type != key.type) + (found_key.type != BTRFS_INODE_REF_KEY && + found_key.type != BTRFS_INODE_EXTREF_KEY)) break; ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, @@ -3987,7 +4020,7 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) if (sctx->cur_ino == 0) goto out; if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && - sctx->cmp_key->type <= BTRFS_INODE_REF_KEY) + sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) goto out; if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) goto out; @@ -4335,7 +4368,8 @@ static int changed_cb(struct btrfs_root *left_root, if (key->type == BTRFS_INODE_ITEM_KEY) ret = changed_inode(sctx, result); - else if (key->type == BTRFS_INODE_REF_KEY) + else if (key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_INODE_EXTREF_KEY) ret = changed_ref(sctx, result); else if (key->type == BTRFS_XATTR_ITEM_KEY) ret = changed_xattr(sctx, result); -- cgit v1.2.3 From d79e50433b2bea09eb680ed5fae15e8a12356353 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 15 Oct 2012 18:28:46 +0000 Subject: Btrfs: send correct rdev and mode in btrfs-send When sending a device file, the stream was missing the mode. Also the rdev was encoded wrongly. Signed-off-by: Arne Jansen --- fs/btrfs/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4f20c49edd5..97cc5399306 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2462,7 +2462,8 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { - TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev); + TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); } ret = send_cmd(sctx); -- cgit v1.2.3 From 5b7ff5b3c4468780b15c6b20cd0567cd9f2aa626 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 16 Oct 2012 05:44:21 +0000 Subject: Btrfs: fix memory leak in btrfs_quota_enable() We should free quota_root before returning from the error handling code. Signed-off-by: Tsutomu Itoh --- fs/btrfs/qgroup.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5039686df6a..fe9d02c45f8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -790,8 +790,10 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out_free_root; + } key.objectid = 0; key.type = BTRFS_QGROUP_STATUS_KEY; @@ -800,7 +802,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*ptr)); if (ret) - goto out; + goto out_free_path; leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], @@ -818,8 +820,15 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, fs_info->quota_root = quota_root; fs_info->pending_quota_state = 1; spin_unlock(&fs_info->qgroup_lock); -out: +out_free_path: btrfs_free_path(path); +out_free_root: + if (ret) { + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); + } +out: return ret; } -- cgit v1.2.3 From e515c18bfef718a7900924d50198d968565dd60e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Oct 2012 09:34:36 +0000 Subject: btrfs: Return EINVAL when length to trim is less than FSB Currently if len argument in btrfs_ioctl_fitrim() is smaller than one FSB we will continue and finally return 0 bytes discarded. However if the length to discard is smaller then file system block we should really return EINVAL. Signed-off-by: Lukas Czerner --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5a2e6c4320..da518ded34b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -343,7 +343,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes) + if (range.start > total_bytes || + range.len < fs_info->sb->s_blocksize) return -EINVAL; range.len = min(range.len, total_bytes - range.start); -- cgit v1.2.3 From 671415b7db49f62896f0b6d50fc4f312a0512983 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:26:46 +0000 Subject: Btrfs: fix deadlock caused by the nested chunk allocation Steps to reproduce: # mkfs.btrfs -m raid1 # btrfstune -S 1 # mount # btrfs device add # mount -o remount,rw # dd if=/dev/zero of=/tmpfile bs=1M count=1 Deadlock happened. It is because of the nested chunk allocation. When we wrote the data into the filesystem, we would allocate the data chunk because there was no data chunk in the filesystem. At the end of the data chunk allocation, we should insert the metadata of the data chunk into the extent tree, but there was no raid1 chunk, so we tried to lock the chunk allocation mutex to allocate the new chunk, but we had held the mutex, the deadlock happened. By rights, we would allocate the raid1 chunk when we added the second device because the profile of the seed filesystem is raid1 and we had two devices. But we didn't do that in fact. It is because the last step of the first device insertion didn't commit the transaction. So when we added the second device, we didn't cow the tree, and just inserted the relative metadata into the leaves which were generated by the first device insertion, and its profile was dup. So, I fix this problem by commiting the transaction at the end of the first device insertion. Signed-off-by: Miao Xie --- fs/btrfs/volumes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 029b903a4ae..0f5ebb72a5e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1819,6 +1819,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) == -ENOENT) + return 0; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); } return ret; -- cgit v1.2.3 From e2d044fe77f8e845333bb1bd29587dc08a4346a0 Mon Sep 17 00:00:00 2001 From: Alex Lyakas Date: Wed, 17 Oct 2012 13:52:47 +0000 Subject: Btrfs: Send: preserve ownership (uid and gid) also for symlinks. This patch also requires a change in the user-space part of "receive". We need to use "lchown" instead of "chown". We will do this in the following patch. Signed-off-by: Alex Lyakas if (S_ISREG(sctx->cur_inode_mode)) { --- fs/btrfs/send.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 97cc5399306..e78b297b0b0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4067,22 +4067,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; - if (!S_ISLNK(sctx->cur_inode_mode)) { - if (!sctx->parent_root || sctx->cur_inode_new) { + if (!sctx->parent_root || sctx->cur_inode_new) { + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode)) need_chmod = 1; - need_chown = 1; - } else { - ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - NULL, NULL, &right_mode, &right_uid, - &right_gid, NULL); - if (ret < 0) - goto out; + } else { + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, + NULL, NULL, &right_mode, &right_uid, + &right_gid, NULL); + if (ret < 0) + goto out; - if (left_uid != right_uid || left_gid != right_gid) - need_chown = 1; - if (left_mode != right_mode) - need_chmod = 1; - } + if (left_uid != right_uid || left_gid != right_gid) + need_chown = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) + need_chmod = 1; } if (S_ISREG(sctx->cur_inode_mode)) { -- cgit v1.2.3 From be6aef604920406b348acf3be6e6e8db55696386 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:43:12 -0400 Subject: Btrfs: Use btrfs_update_inode_fallback when creating a snapshot On a really full file system I was getting ENOSPC back from btrfs_update_inode when trying to update the parent inode when creating a snapshot. Just use the fallback method so we can update the inode and not have to worry about having a delayed ref. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/inode.c | 7 +++---- fs/btrfs/transaction.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1630be83121..8a92ab1632a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3338,6 +3338,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f92def2467a..5fc09908f20 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -94,8 +94,6 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -2746,8 +2744,9 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, return btrfs_update_inode_item(trans, root, inode); } -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) +noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) { int ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 77db875b511..04bbfb1052e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1200,7 +1200,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, parent_root, parent_inode); + ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) btrfs_abort_transaction(trans, root, ret); fail: -- cgit v1.2.3 From 7bfdcf7fbad56c0f1fc3e2d26431bed72bdcce2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Oct 2012 07:30:19 -0600 Subject: Btrfs: fix memory leak when cloning root's node After cloning root's node, we forgot to dec the src's ref which can lead to a memory leak. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index eba44b07682..cdfb4c49a80 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1241,6 +1241,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; + struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; @@ -1264,13 +1265,14 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(root->node); free_extent_buffer(root->node); blocksize = btrfs_level_size(root, old_root->level); - eb = read_tree_block(root, logical, blocksize, 0); - if (!eb) { + old = read_tree_block(root, logical, blocksize, 0); + if (!old) { pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", logical); WARN_ON(1); } else { - eb = btrfs_clone_extent_buffer(eb); + eb = btrfs_clone_extent_buffer(old); + free_extent_buffer(old); } } else if (old_root) { btrfs_tree_read_unlock(root->node); -- cgit v1.2.3 From c37b2b6269ee4637fb7cdb5da0d1e47215d57ce2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:51:44 -0400 Subject: Btrfs: do not bug when we fail to commit the transaction We BUG if we fail to commit the transaction when creating a snapshot, which is just obnoxious. Remove the BUG_ON(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index da518ded34b..84bb4de1bb8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -571,7 +571,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - BUG_ON(ret); + if (ret) + goto fail; ret = pending_snapshot->error; if (ret) -- cgit v1.2.3 From 52b1de91ea07d1f5600b3cae789d5be9a1f61787 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 30 Oct 2012 17:13:52 +0800 Subject: btrfs: unpin_extent_cache: fix the typo and unnecessary arguements - unpint->unpin - prealloc is no more used Signed-off-by: Liu Bo Signed-off-by: Jiri Kosina --- fs/btrfs/extent_map.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8cbc8d5c7f..ce9f7921672 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -234,12 +234,11 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } /** - * unpint_extent_cache - unpin an extent from the cache + * unpin_extent_cache - unpin an extent from the cache * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent * @gen: generation that this extent has been modified in - * @prealloc: if this is set we need to clear the prealloc flag * * Called after an extent has been written to disk properly. Set the generation * to the generation that actually added the file item to the inode so we know -- cgit v1.2.3 From 926ccfef82413623ae0e5c3a32d0cad2b9f33e96 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 1 Nov 2012 00:16:32 +0900 Subject: Btrfs: Fix printk and variable name Correct spelling typo in btrfs. Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/btrfs/ctree.h | 2 +- fs/btrfs/volumes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c72ead86950..596617ecd32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -413,7 +413,7 @@ struct btrfs_root_backup { __le64 bytes_used; __le64 num_devices; /* future */ - __le64 unsed_64[4]; + __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f5ebb72a5e..e3c6ee3cc2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4294,7 +4294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_size); -- cgit v1.2.3 From 48fc7f7e787dd65ffe88521bce31f4062ba273eb Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Wed, 19 Sep 2012 21:48:00 -0400 Subject: Fix misspellings of "whether" in comments. "Whether" is misspelled in various comments across the tree; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/ordered-data.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d3e2c17d8d..06b2635073f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3888,7 +3888,7 @@ static int flush_space(struct btrfs_root *root, * @root - the root we're allocating for * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want - * @flush - wether or not we can flush to make our reservation + * @flush - whether or not we can flush to make our reservation * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dd27a0b46a3..853fc7beedf 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -76,7 +76,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent * has done its due diligence in updating * the isize. */ -- cgit v1.2.3 From 3fed40cc97f32bebfd34a55364de9b44dcbede59 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 13 Sep 2012 04:51:36 -0600 Subject: Btrfs: cleanup duplicated division functions div_factor{_fine} has been implemented for two times, cleanup it. And I move them into a independent file named math.h because they are common math functions. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 19 +------------------ fs/btrfs/math.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 23 +---------------------- 3 files changed, 46 insertions(+), 40 deletions(-) create mode 100644 fs/btrfs/math.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d3e2c17d8d..7563db782ab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "volumes.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" #undef SCRAMBLE_DELAYED_REFS @@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 00000000000..b7816cefbd1 --- /dev/null +++ b/fs/btrfs/math.h @@ -0,0 +1,44 @@ + +/* + * Copyright (C) 2012 Fujitsu. All rights reserved. + * Written by Miao Xie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f5ebb72a5e..a8adf268647 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -36,6 +35,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "math.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2338,18 +2338,6 @@ static int chunk_profiles_filter(u64 chunk_type, return 1; } -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { @@ -2514,15 +2502,6 @@ static int should_balance_chunk(struct btrfs_root *root, return 1; } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; -- cgit v1.2.3 From 561c294d4cfb30c4acfa0a243448fc55af730d87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:32:18 +0000 Subject: Btrfs: fix wrong comment in can_overcommit() The comment is not coincident with the code. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7563db782ab..2cfcce290ab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3668,9 +3668,9 @@ static int can_overcommit(struct btrfs_root *root, avail >>= 1; /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. */ if (flush) avail >>= 3; -- cgit v1.2.3 From 08e007d2e57744472a9424735a368ffe6d625597 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Oct 2012 11:33:38 +0000 Subject: Btrfs: improve the noflush reservation In some places(such as: evicting inode), we just can not flush the reserved space of delalloc, flushing the delayed directory index and delayed inode is OK, but we don't try to flush those things and just go back when there is no enough space to be reserved. This patch fixes this problem. We defined 3 types of the flush operations: NO_FLUSH, FLUSH_LIMIT and FLUSH_ALL. If we can in the transaction, we should not flush anything, or the deadlock would happen, so use NO_FLUSH. If we flushing the reserved space of delalloc would cause deadlock, use FLUSH_LIMIT. In the other cases, FLUSH_ALL is used, and we will flush all things. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 26 ++++++++----- fs/btrfs/delayed-inode.c | 6 ++- fs/btrfs/extent-tree.c | 97 ++++++++++++++++++++++++------------------------ fs/btrfs/inode-map.c | 5 ++- fs/btrfs/inode.c | 5 ++- fs/btrfs/relocation.c | 12 ++++-- fs/btrfs/transaction.c | 30 +++++++-------- fs/btrfs/transaction.h | 2 +- 8 files changed, 97 insertions(+), 86 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c72ead86950..8fd9fe4282f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2900,6 +2900,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2919,19 +2931,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 478f66bdc57..0c6dca550ea 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); /* * Since we're under a transaction reserve_metadata_bytes could * try to commit the transaction which will make it return @@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata( * reserve something strictly for us. If not be a pain and try * to steal from the delalloc block rsv. */ - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (!ret) goto out; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2cfcce290ab..2136adda2a0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3644,7 +3644,7 @@ out: static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, - int flush) + enum btrfs_reserve_flush_enum flush) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3672,7 +3672,7 @@ static int can_overcommit(struct btrfs_root *root, * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ - if (flush) + if (flush == BTRFS_RESERVE_FLUSH_ALL) avail >>= 3; else avail >>= 1; @@ -3696,6 +3696,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + enum btrfs_reserve_flush_enum flush; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -3723,8 +3724,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, wait_event(root->fs_info->async_submit_wait, !atomic_read(&root->fs_info->async_delalloc_pages)); + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(root, space_info, orig, !trans)) { + if (can_overcommit(root, space_info, orig, flush)) { spin_unlock(&space_info->lock); break; } @@ -3882,7 +3887,8 @@ static int flush_space(struct btrfs_root *root, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; @@ -3895,10 +3901,11 @@ again: ret = 0; spin_lock(&space_info->lock); /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - while (flush && !flushing && space_info->flush) { + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { spin_unlock(&space_info->lock); /* * If we have a trans handle we can't wait because the flusher @@ -3964,23 +3971,40 @@ again: * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. */ - if (ret && flush) { + if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { flushing = true; space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret || !flush) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; ret = flush_space(root, space_info, num_bytes, orig_bytes, flush_state); flush_state++; + + /* + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. + */ + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; + if (!ret) goto again; - else if (flush_state <= COMMIT_TRANS) + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) goto again; out: @@ -4131,9 +4155,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) { int ret; @@ -4149,20 +4173,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { @@ -4181,9 +4191,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -4211,20 +4221,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4515,14 +4511,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 csum_bytes; unsigned nr_extents = 0; int extra_reserve = 0; - int flush = 1; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(inode)) - flush = 0; + flush = BTRFS_RESERVE_NO_FLUSH; - if (flush && btrfs_transaction_in_commit(root->fs_info)) + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); mutex_lock(&BTRFS_I(inode)->delalloc_mutex); @@ -6252,7 +6249,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -6279,7 +6277,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); WARN_ON(1); } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b1a1c929ba8..d26f67a59e3 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 3 items for pre-allocation */ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, - trans->bytes_reserved); + ret = btrfs_block_rsv_add(root, trans->block_rsv, + trans->bytes_reserved, + BTRFS_RESERVE_NO_FLUSH); if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95542a1b3df..db3dd4ed057 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3829,7 +3829,8 @@ void btrfs_evict_inode(struct inode *inode) * inode item when doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); /* * Try and steal from the global reserve since we will @@ -3847,7 +3848,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_noflush(root, 1); + trans = btrfs_start_transaction_lflush(root, 1); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 776f0aa128f..242d6de4d8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2074,7 +2074,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2184,7 +2185,8 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) err = ret; } @@ -2459,7 +2461,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3685,7 +3688,8 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256); + rc->extent_root->nodesize * 256, + BTRFS_RESERVE_FLUSH_ALL); if (ret) return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04bbfb1052e..4e1def4c06b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -295,9 +295,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type) return 0; } -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type, - int noflush) +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, int type, + enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -331,14 +331,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - if (noflush) - ret = btrfs_block_rsv_add_noflush(root, - &root->fs_info->trans_block_rsv, - num_bytes); - else - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, + &root->fs_info->trans_block_rsv, + num_bytes, flush); if (ret) return ERR_PTR(ret); } @@ -422,13 +417,15 @@ got_it: struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 0); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL); } -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items) { - return start_transaction(root, num_items, TRANS_START, 1); + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_LIMIT); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) @@ -1032,8 +1029,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add(root, &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); if (ret) { pending->error = ret; goto no_free_objectid; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 80961947a6b..0e8aa1e6c28 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); -- cgit v1.2.3 From de1ee92ac3bce4c9d760016c4d6198158e6e2f15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Oct 2012 16:50:56 -0400 Subject: Btrfs: recheck bio against block device when we map the bio Alex reported a problem where we were writing between chunks on a rbd device. The thing is we do bio_add_page using logical offsets, but the physical offset may be different. So when we map the bio now check to see if the bio is still ok with the physical offset, and if it is not split the bio up and redo the bio_add_page with the physical sector. This fixes the problem for Alex and doesn't affect performance in the normal case. Thanks, Reported-and-tested-by: Alex Elder Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 159 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 131 insertions(+), 28 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8adf268647..eaaf0bf5279 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4217,6 +4217,113 @@ static noinline void schedule_bio(struct btrfs_root *root, &device->work); } +static int bio_size_ok(struct block_device *bdev, struct bio *bio, + sector_t sector) +{ + struct bio_vec *prev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned short max_sectors = queue_max_sectors(q); + struct bvec_merge_data bvm = { + .bi_bdev = bdev, + .bi_sector = sector, + .bi_rw = bio->bi_rw, + }; + + if (bio->bi_vcnt == 0) { + WARN_ON(1); + return 1; + } + + prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if ((bio->bi_size >> 9) > max_sectors) + return 0; + + if (!q->merge_bvec_fn) + return 1; + + bvm.bi_size = bio->bi_size - prev->bv_len; + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) + return 0; + return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *bio, u64 physical, int dev_nr, + int rw, int async) +{ + struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + + bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = physical >> 9; +#ifdef DEBUG + { + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); + } +#endif + bio->bi_bdev = dev->bdev; + if (async) + schedule_bio(root, dev, rw, bio); + else + btrfsic_submit_bio(rw, bio); +} + +static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, + struct bio *first_bio, struct btrfs_device *dev, + int dev_nr, int rw, int async) +{ + struct bio_vec *bvec = first_bio->bi_io_vec; + struct bio *bio; + int nr_vecs = bio_get_nr_vecs(dev->bdev); + u64 physical = bbio->stripes[dev_nr].physical; + +again: + bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); + if (!bio) + return -ENOMEM; + + while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { + if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len) { + u64 len = bio->bi_size; + + atomic_inc(&bbio->stripes_pending); + submit_stripe_bio(root, bbio, bio, physical, dev_nr, + rw, async); + physical += len; + goto again; + } + bvec++; + } + + submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); + return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ + atomic_inc(&bbio->error); + if (atomic_dec_and_test(&bbio->stripes_pending)) { + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; + bio->bi_sector = logical >> 9; + kfree(bbio); + bio_endio(bio, -EIO); + } +} + int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { @@ -4255,40 +4362,36 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { + dev = bbio->stripes[dev_nr].dev; + if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + bbio_error(bbio, first_bio, logical); + dev_nr++; + continue; + } + + /* + * Check and see if we're ok with this bio based on it's size + * and offset with the given device. + */ + if (!bio_size_ok(dev->bdev, first_bio, + bbio->stripes[dev_nr].physical >> 9)) { + ret = breakup_stripe_bio(root, bbio, first_bio, dev, + dev_nr, rw, async_submit); + BUG_ON(ret); + dev_nr++; + continue; + } + if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; } - bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); - bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; - dev = bbio->stripes[dev_nr].dev; - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { -#ifdef DEBUG - struct rcu_string *name; - - rcu_read_lock(); - name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - name->str, dev->devid, bio->bi_size); - rcu_read_unlock(); -#endif - bio->bi_bdev = dev->bdev; - if (async_submit) - schedule_bio(root, dev, rw, bio); - else - btrfsic_submit_bio(rw, bio); - } else { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - bio->bi_sector = logical >> 9; - bio_endio(bio, -EIO); - } + + submit_stripe_bio(root, bbio, bio, + bbio->stripes[dev_nr].physical, dev_nr, rw, + async_submit); dev_nr++; } return 0; -- cgit v1.2.3 From de6c4115a297d4bbf178aca9948c3539f89c9caa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 18 Oct 2012 08:18:01 +0000 Subject: Btrfs: fix unnecessary while loop when search the free space, cache When we find a bitmap free space entry, we may check the previous extent entry covers the offset or not. But if we find this entry is also a bitmap entry, we will continue to check the previous entry of the current one by a while loop. It is unnecessary because it is impossible that the extent entry which is in front of a bitmap entry can cover the offset of the entry after that bitmap entry. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1027b854b90..557502ca1a2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1250,18 +1250,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - entry = prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + entry = prev; } } return entry; @@ -1287,18 +1282,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, } if (entry->bitmap) { - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; + n = rb_prev(&entry->offset_index); + if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - return prev; - break; - } + if (!prev->bitmap && + prev->offset + prev->bytes > offset) + return prev; } if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; -- cgit v1.2.3 From 95c80bb1f6b24b57058d971ed252b2c1c5121b51 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:52 +0000 Subject: Btrfs: MOD_LOG_KEY_REMOVE_WHILE_MOVING never change node's nritems Key MOD_LOG_KEY_REMOVE_WHILE_MOVING means that we're doing memmove inside an extent buffer node, and the node's number of items remains unchanged (unless we are inserting a single pointer, but we have MOD_LOG_KEY_ADD for that). So we don't need to increase node's number of items during rewinding, otherwise we may get an node larger than leafsize and cause general protection errors later. Here is the details, - If we do memory move for inserting a single pointer, we need to add node's nritems by one, and we honor MOD_LOG_KEY_ADD for adding. - If we do memory move for deleting a single pointer, we need to decrease node's nritems by one, and we honor MOD_LOG_KEY_REMOVE for deleting. - If we do memory move for balance left/right, we need to decrease node's nritems, and we honor MOD_LOG_KEY_REMOVE for balaning. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdfb4c49a80..b12c0395916 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1140,13 +1140,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: + n++; + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: btrfs_set_node_key(eb, &tm->key, tm->slot); btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); - n++; break; case MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); -- cgit v1.2.3 From 6a7a665d78c5dd8bc76a010648c4e7d84517ab5a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:53 +0000 Subject: Btrfs: reorder tree mod log operations in deleting a pointer Since we don't use MOD_LOG_KEY_REMOVE_WHILE_MOVING to add nritems during rewinding, we should insert a MOD_LOG_KEY_REMOVE operation first. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b12c0395916..4d518bd7751 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4609,6 +4609,12 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; int ret; + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } + nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (tree_mod_log && level) @@ -4619,10 +4625,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); - } else if (tree_mod_log && level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); } nritems--; -- cgit v1.2.3 From 0e411ecec60138f22442728f036d38cfea007817 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 09:50:54 +0000 Subject: Btrfs: kill unnecessary arguments in del_ptr The argument 'tree_mod_log' is not necessary since all of callers enable it. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4d518bd7751..615b74968fa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log); + struct btrfs_path *path, int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -1827,7 +1826,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1, 1); + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1871,7 +1870,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot, 1); + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -4602,14 +4601,13 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log) + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret; - if (tree_mod_log && level) { + if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); @@ -4617,7 +4615,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (tree_mod_log && level) + if (level) tree_mod_log_eb_move(root->fs_info, parent, slot, slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, @@ -4658,7 +4656,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1], 1); + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we -- cgit v1.2.3 From 32adf0901371c8b9d258dba7811f3067d1d2ea5c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 19 Oct 2012 12:52:15 +0000 Subject: Btrfs: cleanup unused arguments 'disk_key' is not used at all. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 615b74968fa..100c274a1cf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -775,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int slot, int atomic) + struct extent_buffer *eb, int slot, int atomic) { int ret; @@ -1835,7 +1834,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &right_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1879,7 +1878,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + tree_mod_log_set_node_key(root->fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1979,7 +1978,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot, 0); + pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2032,7 +2031,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2916,7 +2915,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) -- cgit v1.2.3 From 7b398f8e58c415738e397645c926253c428cf002 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Oct 2012 15:52:28 -0400 Subject: Btrfs: fill the global reserve when unpinning space Dave gave me an image of a very full file system that would abort the transaction because it ran out of space while committing the transaction. This is because we would think there was plenty of room to create a snapshot even though the global reserve was not full. This happens because we calculate the global reserve size before we unpin any space, so after we unpin the space we allow reservations to occur even though we haven't reserved all of the space for our global reserve. Fix this by adding to the global reserve while unpinning in order to make sure we always have enough space to do our work. With this patch we no longer end up with an aborted transaction, we return ENOSPC properly to the person trying to create the snapshot. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2136adda2a0..b495cb4b9b2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4949,9 +4949,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) @@ -4969,15 +4973,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } start += len; + space_info = cache->space_info; - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; + space_info->bytes_pinned -= len; + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) -- cgit v1.2.3 From 8ccf6f19b67f7e0921063cc309f4672a6afcb528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:28:04 +0000 Subject: Btrfs: make delalloc inodes be flushed by multi-task This patch introduce a new worker pool named "flush_workers", and if we want to force all the inode with pending delalloc to the disks, we can queue those inodes into the work queue of the worker pool, in this way, those inodes will be flushed by multi-task. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 +++++++++ fs/btrfs/disk-io.c | 7 +++++ fs/btrfs/inode.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/relocation.c | 6 +++- fs/btrfs/transaction.c | 6 +++- 5 files changed, 103 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd9fe4282f..cad16566da3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1333,6 +1333,7 @@ struct btrfs_fs_info { struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; + struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; @@ -3277,6 +3278,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1..bd70c2852ba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2279,6 +2279,10 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), @@ -2350,6 +2354,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); + ret |= btrfs_start_workers(&fs_info->flush_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2667,6 +2672,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->flush_workers); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3339,6 +3345,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db3dd4ed057..dce9e218b84 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; @@ -7204,6 +7205,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); + if (btrfs_delalloc_work_cachep) + kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) @@ -7238,6 +7241,13 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", + sizeof(struct btrfs_delalloc_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_delalloc_work_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -7448,6 +7458,49 @@ out_notrans: return ret; } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ + struct btrfs_delalloc_work *delalloc_work; + + delalloc_work = container_of(work, struct btrfs_delalloc_work, + work); + if (delalloc_work->wait) + btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); + else + filemap_flush(delalloc_work->inode->i_mapping); + + if (delalloc_work->delay_iput) + btrfs_add_delayed_iput(delalloc_work->inode); + else + iput(delalloc_work->inode); + complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput) +{ + struct btrfs_delalloc_work *work; + + work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + if (!work) + return NULL; + + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->delay_iput = delay_iput; + work->work.func = btrfs_run_delalloc_work; + + return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_delalloc_work_cachep, work); +} + /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. @@ -7457,10 +7510,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; + struct btrfs_delalloc_work *work, *next; + struct list_head works; + int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + INIT_LIST_HEAD(&works); + spin_lock(&root->fs_info->delalloc_lock); while (!list_empty(head)) { binode = list_entry(head->next, struct btrfs_inode, @@ -7470,11 +7528,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&binode->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - filemap_flush(inode->i_mapping); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (!work) { + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -7493,7 +7554,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + return ret; } static int btrfs_symlink(struct inode *dir, struct dentry *dentry, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 242d6de4d8e..270f24ffe1b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4061,7 +4061,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + if (ret < 0) { + err = ret; + goto out; + } btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4e1def4c06b..9c466f9f817 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1497,7 +1497,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, WARN_ON(cur_trans != trans->transaction); if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } btrfs_wait_ordered_extents(root, 1); } -- cgit v1.2.3 From 25287e0a16c0ad068aa89ab01aea6c699b31ec12 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:31:03 +0000 Subject: Btrfs: make ordered operations be handled by multi-task The process of the ordered operations is similar to the delalloc inode flush, so we handle them by flush workers. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 46 ++++++++++++++++++++++++++++++---------------- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/transaction.c | 18 ++++++++++++++---- 3 files changed, 45 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7772f02ba28..ab2a3c0c540 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -519,13 +519,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; struct list_head splice; + struct list_head works; + struct btrfs_delalloc_work *work, *next; + int ret = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); @@ -533,6 +537,7 @@ again: list_splice_init(&root->fs_info->ordered_operations, &splice); while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -549,15 +554,26 @@ again: list_add_tail(&BTRFS_I(inode)->ordered_operations, &root->fs_info->ordered_operations); } + + if (!inode) + continue; spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); + work = btrfs_alloc_delalloc_work(inode, wait, 1); + if (!work) { + if (list_empty(&BTRFS_I(inode)->ordered_operations)) + list_add_tail(&btrfs_inode->ordered_operations, + &splice); + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_tail(&splice, + &root->fs_info->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); @@ -566,7 +582,13 @@ again: goto again; spin_unlock(&root->fs_info->ordered_extent_lock); +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } mutex_unlock(&root->fs_info->ordered_operations_mutex); + return ret; } /* @@ -934,15 +956,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - /* - * the transaction is already committing. Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return; - } - spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, @@ -959,6 +972,7 @@ int __init ordered_data_init(void) NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; + return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dd27a0b46a3..e8dcec63511 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -186,7 +186,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9c466f9f817..259f74eabdb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1412,15 +1412,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret = -EIO; + int ret; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(root, 0); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } - if (cur_trans->aborted) + if (cur_trans->aborted) { + ret = cur_trans->aborted; goto cleanup_transaction; + } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here @@ -1523,7 +1529,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 9afab8820bb8b55af669b199597d6716e04d1ba8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 25 Oct 2012 09:41:36 +0000 Subject: Btrfs: make ordered extent be flushed by multi-task Though the process of the ordered extents is a bit different with the delalloc inode flush, but we can see it as a subset of the delalloc inode flush, so we also handle them by flush workers. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 41 +++++++++++++++++++++++++++++++++-------- fs/btrfs/ordered-data.h | 5 ++++- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ab2a3c0c540..eecc20f14cf 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); trace_btrfs_ordered_extent_add(inode, entry); @@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode, wake_up(&entry->wait); } +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered; + + ordered = container_of(work, struct btrfs_ordered_extent, flush_work); + btrfs_start_ordered_extent(ordered->inode, ordered, 1); + complete(&ordered->completion); +} + /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { - struct list_head splice; + struct list_head splice, works; struct list_head *cur; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *ordered, *next; struct inode *inode; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); @@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->ordered_extent_lock); if (inode) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); } else { btrfs_put_ordered_extent(ordered); } + cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); + + list_for_each_entry_safe(ordered, next, &works, work_list) { + list_del_init(&ordered->work_list); + wait_for_completion(&ordered->completion); + + inode = ordered->inode; + btrfs_put_ordered_extent(ordered); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + + cond_resched(); + } } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e8dcec63511..efc7c2930c1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -128,8 +128,11 @@ struct btrfs_ordered_extent { struct list_head root_extent_list; struct btrfs_work work; -}; + struct completion completion; + struct btrfs_work flush_work; + struct list_head work_list; +}; /* * calculates the total size you need to allocate for an ordered sum -- cgit v1.2.3 From d0e1d66b5aa1ec9f556f951aa9a114cc192cd01c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Dec 2012 16:00:21 -0800 Subject: writeback: remove nr_pages_dirtied arg from balance_dirty_pages_ratelimited_nr() There is no reason to pass the nr_pages_dirtied argument, because nr_pages_dirtied value from the caller is unused in balance_dirty_pages_ratelimited_nr(). Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/file.c | 3 +-- fs/btrfs/ioctl.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1..22a0439e5a8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } @@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed8811..a8ee75cb96e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, cond_resched(); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_pages); + balance_dirty_pages_ratelimited(inode->i_mapping); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28..5b3429ab8ec 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } defrag_count += ret; - balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + balance_dirty_pages_ratelimited(inode->i_mapping); mutex_unlock(&inode->i_mutex); if (newer_than) { -- cgit v1.2.3 From 0253f40ef9a709a1af39ce38b1d998af090f8127 Mon Sep 17 00:00:00 2001 From: "jeff.liu" Date: Sat, 27 Oct 2012 12:06:39 +0000 Subject: Btrfs: Remove the invalid shrink size check up from btrfs_shrink_dev() Remove an invalid size check up from btrfs_shrink_dev(). The new size should not larger than the device->total_bytes as it was already verified before coming to here(i.e. new_size < old_size). Remove invalid check up for btrfs_shrink_dev(). Signed-off-by: Jie Liu Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/volumes.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28..14c0d2e0790 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1409,7 +1409,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, btrfs_commit_transaction(trans, root); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); - } + } /* equal, nothing need to do */ out_free: kfree(vol_args); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eaaf0bf5279..32a88428f6d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3059,9 +3059,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; - if (new_size >= device->total_bytes) - return -EINVAL; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3 From d1423248734df6d9aff769abffd675dc034e0601 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 31 Oct 2012 15:16:32 +0000 Subject: Btrfs: Fix typo in fs/btrfs Correct spelling typo in btrfs. Signed-off-by: Masanari Iida Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/volumes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cad16566da3..2d41cb25266 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -413,7 +413,7 @@ struct btrfs_root_backup { __le64 bytes_used; __le64 num_devices; /* future */ - __le64 unsed_64[4]; + __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a88428f6d..eeed97d19de 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4261,7 +4261,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_size); -- cgit v1.2.3 From 292fd7fc39aa06668f3a8db546714e727120cb3e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 30 Oct 2012 17:16:16 +0000 Subject: Btrfs: don't allow degraded mount if too many devices are missing The current behavior is to allow mounting or remounting a filesystem writeable in degraded mode if at least one writeable device is present. The next failed write access to a missing device which is above the tolerance of the configured level of redundancy results in an read-only enforcement. Even without this, the next time barrier_all_devices() is called and more devices are missing than tolerable, the switch to read-only mode takes place. In order to behave predictably and to provide proper feedback to the user at mount time, this patch compares the number of missing devices with the number of devices that are tolerated to be missing according to the configured RAID level. If more devices are missing than tolerated, e.g. if two devices are missing in case of RAID1, only a read-only mount and remount is allowed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 +++++++ fs/btrfs/super.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd70c2852ba..064315990f8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2508,6 +2508,13 @@ retry_root_backup: } fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable mount is not allowed\n"); + goto fail_block_groups; + } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 915ac14c206..acd2df85bed 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1226,6 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable remount is not allowed\n"); + ret = -EACCES; + goto restore; + } + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; -- cgit v1.2.3 From 183f37fa3503332740c76f1b493f4304ec889358 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:47 +0000 Subject: Btrfs: do not log extents when we only log new names When we log new names, we need to log just enough to recreate the inode during log replay, and there is no need to log extents along with it. This actually fixes a bug revealed by xfstests 241, where it shows that we're logging some extents that have not updated metadata, so we don't get proper EXTENT_DATA items to be copied to log tree. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d9677..4ec41ecb4d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3435,7 +3435,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { - fast_search = true; + if (inode_only == LOG_INODE_ALL) + fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, BTRFS_XATTR_ITEM_KEY); -- cgit v1.2.3 From 9f3959c53d57d010ae6f4205fbd0159cb7976a83 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:48 +0000 Subject: Btrfs: get right arguments for btrfs_wait_ordered_range btrfs_wait_ordered_range expects for 'len' instead of 'end'. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed8811..d2df98124d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1562,7 +1562,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end); + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* -- cgit v1.2.3 From 4fde183d8c755f8a8bdffcb03a8d947e62ccea6a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 1 Nov 2012 06:38:49 +0000 Subject: Btrfs: cleanup for btrfs_wait_order_range Variable 'found' is no more used. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index eecc20f14cf..f1073129704 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -653,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; - int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -689,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; - found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -702,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); -- cgit v1.2.3 From b7d5b0a819498a9c04e1d18201a42468f7edd92a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:32:18 +0000 Subject: Btrfs: fix joining the same transaction handler more than 2 times If we flush inodes with pending delalloc in a transaction, we may join the same transaction handler more than 2 times. The reason is: Task use_count of trans handle commit_transaction 1 |-> btrfs_start_delalloc_inodes 1 |-> run_delalloc_nocow 1 |-> join_transaction 2 |-> cow_file_range 2 |-> join_transaction 3 In fact, cow_file_range needn't join the transaction again because the caller have joined the transaction, so we fix this problem by this way. Reported-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 77 ++++++++++++++++++++++++++++++-------------------- fs/btrfs/transaction.c | 1 + 2 files changed, 48 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dce9e218b84..96d20903bee 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -804,14 +804,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) +static noinline int __cow_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -824,25 +824,10 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(btrfs_is_free_space_inode(inode)); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; - ret = 0; /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && @@ -953,11 +938,9 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } - ret = 0; out: - btrfs_end_transaction(trans, root); - return ret; + out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -972,6 +955,39 @@ out_unlock: goto out; } +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + ret = __cow_file_range(trans, inode, root, locked_page, start, end, + page_started, nr_written, unlock); + + btrfs_end_transaction(trans, root); + + return ret; +} + /* * work queue call back to started compression on a file and pages */ @@ -1282,9 +1298,9 @@ out_check: btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, - found_key.offset - 1, page_started, - nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1353,8 +1369,9 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = __cow_file_range(trans, inode, root, locked_page, + cow_start, end, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 259f74eabdb..44a5d73fddb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -312,6 +312,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); h = current->journal_info; h->use_count++; + WARN_ON(h->use_count > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; -- cgit v1.2.3 From ca46963718ef7368c84267c9f5e7394c3890442a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:14 +0000 Subject: Btrfs: fix missing flush when committing a transaction Consider the following case: Task1 Task2 start_transaction commit_transaction check pending snapshots list and the list is empty. add pending snapshot into list skip the delalloc flush end_transaction ... And then the problem that the snapshot is different with the source subvolume happen. This patch fixes the above problem by flush all pending stuffs when all the other tasks end the transaction. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 82 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 35 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 44a5d73fddb..bc1f5239733 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1399,6 +1399,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } +static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); + int snap_pending = 0; + int ret; + + if (!flush_on_commit) { + spin_lock(&root->fs_info->trans_lock); + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + spin_unlock(&root->fs_info->trans_lock); + } + + if (flush_on_commit || snap_pending) { + btrfs_start_delalloc_inodes(root, 1); + btrfs_wait_ordered_extents(root, 1); + } + + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; + + /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + + /* + * rename don't use btrfs_join_transaction, so, once we + * set the transaction to blocked above, we aren't going + * to get any new ordered operations. We can safely run + * it here and no for sure that nothing new will be added + * to the list + */ + btrfs_run_ordered_operations(root, 1); + + return 0; +} + /* * btrfs_transaction state sequence: * in_commit = 0, blocked = 0 (initial) @@ -1416,7 +1458,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); ret = btrfs_run_ordered_operations(root, 0); if (ret) { @@ -1495,47 +1536,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - int snap_pending = 0; - joined = cur_trans->num_joined; - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - btrfs_wait_ordered_extents(root, 1); - } - - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(root, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; - } - prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -1548,6 +1556,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; + /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting -- cgit v1.2.3 From 315a9850da2b89c83971b26fe54a60f22bdd91ad Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:33:59 +0000 Subject: Btrfs: fix wrong file extent length There are two types of the file extent - inline extent and regular extent, When we log file extents, we didn't take inline extent into account, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/file-item.c | 21 ++++++++++++++++++++- fs/btrfs/tree-log.c | 10 ++-------- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2d41cb25266..f9a078661eb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3263,6 +3263,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); +u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ad08e4e4a1..bd38cef4235 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -133,7 +133,6 @@ fail: return ERR_PTR(ret); } - int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +u64 btrfs_file_extent_length(struct btrfs_path *path) +{ + int extent_type; + struct btrfs_file_extent_item *fi; + u64 len; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(path->nodes[0], fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) + len = btrfs_file_extent_num_bytes(path->nodes[0], fi); + else if (extent_type == BTRFS_FILE_EXTENT_INLINE) + len = btrfs_file_extent_inline_len(path->nodes[0], fi); + else + BUG(); + + return len; +} static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ec41ecb4d6..bcf0e48b193 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3143,7 +3143,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path, struct log_args *args) { struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 start = em->mod_start; u64 search_start = start; @@ -3199,10 +3198,7 @@ again: } } while (key.offset > start); - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], - fi); + num_bytes = btrfs_file_extent_length(path); if (key.offset + num_bytes <= start) { btrfs_release_path(path); return -ENOENT; @@ -3211,8 +3207,7 @@ again: args->src = path->nodes[0]; next_slot: btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - fi = btrfs_item_ptr(args->src, path->slots[0], - struct btrfs_file_extent_item); + num_bytes = btrfs_file_extent_length(path); if (args->nr && args->start_slot + args->nr == path->slots[0]) { args->nr++; @@ -3230,7 +3225,6 @@ next_slot: } nritems = btrfs_header_nritems(path->nodes[0]); path->slots[0]++; - num_bytes = btrfs_file_extent_num_bytes(args->src, fi); if (len < num_bytes) { /* I _think_ this is ok, envision we write to a * preallocated space that is adjacent to a previously -- cgit v1.2.3 From bbe1426764e5dfaa57e7b12cc954acdb3fb7f94b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:34:54 +0000 Subject: Btrfs: fix unprotected extent map operation when logging file extents We forget to protect the modified_extents list, fix it. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcf0e48b193..d1947af67bc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3526,8 +3526,10 @@ next_slot: struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; + write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) list_del_init(&em->list); + write_unlock(&tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From 5269b67e3d809dcaa4c6763a343423bb1b7b3fe6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 1 Nov 2012 07:35:23 +0000 Subject: Btrfs: fix missing log when BTRFS_INODE_NEEDS_FULL_SYNC is set If we set BTRFS_INODE_NEEDS_FULL_SYNC, we should log all the extent, but now we forget to take it into account, and set a wrong max key, if so, we will skip the file extent metadata when doing logging. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1947af67bc..40b9efd20e4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3394,7 +3394,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* today the code can only do partial logging of directories */ - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || + (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags) && + inode_only == LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; -- cgit v1.2.3 From 31b1a2bd758f439fc945b3ac5899d890cb7e2dc6 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 10:58:34 +0000 Subject: fs/btrfs: use WARN Use WARN rather than printk followed by WARN_ON(1), for conciseness. A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression list es; @@ -printk( +WARN(1, es); -WARN_ON(1); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 19 +++++++------------ fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/extent-tree.c | 7 +++---- fs/btrfs/extent_io.c | 9 +++------ fs/btrfs/inode.c | 3 +-- fs/btrfs/transaction.c | 12 ++++-------- fs/btrfs/volumes.c | 3 +-- 7 files changed, 21 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 100c274a1cf..0e4adb00e9d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1359,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long) root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -3640,11 +3637,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, right_nritems); - WARN_ON(1); - } if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 064315990f8..07a2162cdd6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3397,14 +3397,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", (unsigned long long)buf->start, (unsigned long long)transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b495cb4b9b2..0bcb9543da6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6292,10 +6292,9 @@ use_block_rsv(struct btrfs_trans_handle *trans, static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) { - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); - WARN_ON(1); - } + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", + ret); ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 472873a94d9..3c062c8d1d7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", + if (end < start) + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", (unsigned long long)end, (unsigned long long)start); - WARN_ON(1); - } state->start = start; state->end = end; @@ -4721,10 +4719,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " "wanted %lu %lu\n", (unsigned long long)eb->start, eb->len, start, min_len); - WARN_ON(1); return -EINVAL; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 96d20903bee..6dca345fd1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5458,8 +5458,7 @@ again: extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } else { - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); - WARN_ON(1); + WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc1f5239733..f21f39f0b1a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -145,16 +145,12 @@ loop: * the log must never go across transaction boundaries. */ smp_mb(); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + if (!list_empty(&fs_info->tree_mod_seq_list)) + WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } - if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { - printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) + WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); - WARN_ON(1); - } atomic_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->commit_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eeed97d19de..3f4bfee66d7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3323,9 +3323,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, cur = cur->next; if (!device->writeable) { - printk(KERN_ERR + WARN(1, KERN_ERR "btrfs: read-only device in alloc_list\n"); - WARN_ON(1); continue; } -- cgit v1.2.3 From 6c1500f22a7be3a24ad3dffcdbf04be3f676521b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 3 Nov 2012 20:30:18 +0000 Subject: fs/btrfs: drop if around WARN_ON Just use WARN_ON rather than an if containing only WARN_ON(1). A simplified version of the semantic patch that makes this transformation is as follows: (http://coccinelle.lip6.fr/) // @@ expression e; @@ - if (e) WARN_ON(1); + WARN_ON(e); // Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 3 +-- fs/btrfs/ctree.c | 9 +++------ fs/btrfs/inode.c | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 208d8aa5b07..a3219523ebc 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -890,8 +890,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); + WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0e4adb00e9d..5c2cf992e71 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1464,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (cache_only && parent_level != 1) return 0; - if (trans->transaction != root->fs_info->running_transaction) - WARN_ON(1); - if (trans->transid != root->fs_info->generation) - WARN_ON(1); + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); blocksize = btrfs_level_size(root, parent_level - 1); @@ -3398,8 +3396,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (push_items == 0) goto out_unlock; - if (!empty && push_items == left_nritems) - WARN_ON(1); + WARN_ON(!empty && push_items == left_nritems); /* push left to right */ right_nritems = btrfs_header_nritems(right); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dca345fd1b..e8733fab273 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1675,8 +1675,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) - WARN_ON(1); + WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, cached_state, GFP_NOFS); } -- cgit v1.2.3 From 37c4146d2208ba7e4463e8dd95a1bf9e3d865280 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:08 +0000 Subject: Btrfs: fix a deadlock in aborting transaction due to ENOSPC When committing a transaction, we may bail out of running delayed refs due to ENOSPC, and then abort the current transaction to flip into readonly. But we'll hit a deadlock on ref head's lock since we forget to release its lock and other cleanup stuff. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0bcb9543da6..f8a358aee06 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2297,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, kfree(extent_op); if (ret) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; @@ -2339,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, count++; if (ret) { + if (locked_ref) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + } printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; -- cgit v1.2.3 From 109f2365f1928af241b2ccbd0f6ba0b93d911288 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 5 Nov 2012 12:42:09 +0000 Subject: Btrfs: fix a double free on pending snapshots in error handling When creating a snapshot, failing to commit a transaction can end up with aborting the transaction, following by doing a cleanup for it, where we'll free all snapshots pending to disk. So we check it and avoid double free on pending snapshots. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14c0d2e0790..e262cd8c4a7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -571,8 +571,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) + if (ret) { + /* cleanup_transaction has freed this for us */ + if (trans->aborted) + pending_snapshot = NULL; goto fail; + } ret = pending_snapshot->error; if (ret) -- cgit v1.2.3 From d03f918ab9036cc71740c0aa796c8e02e6f6f6d3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 13:10:49 +0000 Subject: Btrfs: Don't trust the superblock label and simply printk("%s") it Someone who is root or capable(CAP_SYS_ADMIN) could corrupt the superblock and make Btrfs printk("%s") crash while holding the uuid_mutex since nobody forces a limit on the string. Since the uuid_mutex is significant, the system would be unusable afterwards. Signed-off-by: Stefan Behrens Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f4bfee66d7..db79fb7e7e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -764,10 +764,13 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "device label %s ", disk_super->label); - else + } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + } printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); -- cgit v1.2.3 From e1f5790e0588bc5b11eb57f95bfde8702049dd0d Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 8 Nov 2012 04:47:33 +0000 Subject: Btrfs: set hole punching time properly Even if the hole punching is executed, the modification time of the file is not updated. So, current time is set to inode. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d2df98124d0..883cf826cf2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1964,6 +1964,9 @@ out_trans: if (!trans) goto out_free; + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); nr = trans->blocks_used; -- cgit v1.2.3 From 3ef5969cd8a42a78ccdbc53f7abb2e6136b2ec65 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Thu, 8 Nov 2012 21:27:24 +0000 Subject: Btrfs: merge inode_list in __merge_refs When __merge_refs merges two refs, it is also needed to merge the inode_list of both refs. Otherwise we have missed backrefs and memory leaks. This happens for example if two inodes share an extent and both lie in the same leaf and thus also have the same parent. Signed-off-by: Alexander Block Reviewed-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a3219523ebc..04edf69be87 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode) pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; struct __prelim_ref *xchg; + struct extent_inode_elem *eie; ref2 = list_entry(pos2, struct __prelim_ref, list); @@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = ref2; ref2 = xchg; } - ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) continue; - ref1->count += ref2->count; } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + list_del(&ref2->list); kfree(ref2); } -- cgit v1.2.3 From b53d3f5db2b79637acadc06a330db6c2c60863f5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:34:34 +0000 Subject: Btrfs: cleanup for btrfs_btree_balance_dirty - 'nr' is no more used. - btrfs_btree_balance_dirty() and __btrfs_btree_balance_dirty() can share a bunch of code. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 5 +---- fs/btrfs/disk-io.c | 29 ++++++++++------------------- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/file.c | 9 +++------ fs/btrfs/inode.c | 42 +++++++++++------------------------------- fs/btrfs/relocation.c | 22 ++++++---------------- fs/btrfs/transaction.c | 4 +--- 7 files changed, 34 insertions(+), 81 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0c6dca550ea..34836036f01 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1257,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1318,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) delayed_node); mutex_unlock(&delayed_node->mutex); - nr = trans->blocks_used; - trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); - __btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty_nodelay(root); free_path: btrfs_free_path(path); out: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07a2162cdd6..ff5d259ac27 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3411,7 +3411,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) } } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) { /* * looks as though older kernels can get into trouble with @@ -3423,7 +3424,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) if (current->flags & PF_MEMALLOC) return; - btrfs_balance_delayed_items(root); + if (flush_delayed) + btrfs_balance_delayed_items(root); num_dirty = root->fs_info->dirty_metadata_bytes; @@ -3434,25 +3436,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) return; } -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +void btrfs_btree_balance_dirty(struct btrfs_root *root) { - /* - * looks as though older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - num_dirty = root->fs_info->dirty_metadata_bytes; + __btrfs_btree_balance_dirty(root, 1); +} - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); } int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2025a9132c1..305c33efb0e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 883cf826cf2..bd7f1b01e05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1349,7 +1349,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, balance_dirty_pages_ratelimited_nr(inode->i_mapping, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); + btrfs_btree_balance_dirty(root); pos += copied; num_written += copied; @@ -1803,7 +1803,6 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; - unsigned long nr; int ret = 0; int err = 0; bool same_page = (offset >> PAGE_CACHE_SHIFT) == @@ -1931,9 +1930,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { @@ -1969,9 +1967,8 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out_free: btrfs_free_path(path); btrfs_free_block_rsv(root, rsv); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8733fab273..aabf747d056 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3091,7 +3091,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = dentry->d_inode; int ret; - unsigned long nr = 0; trans = __unlink_start_trans(dir, dentry); if (IS_ERR(trans)) @@ -3111,9 +3110,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return ret; } @@ -3203,7 +3201,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - unsigned long nr = 0; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -3232,9 +3229,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - nr = trans->blocks_used; __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -3800,7 +3796,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - unsigned long nr; int ret; trace_btrfs_inode_evict(inode); @@ -3882,10 +3877,9 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } btrfs_free_block_rsv(root, rsv); @@ -3901,9 +3895,8 @@ void btrfs_evict_inode(struct inode *inode) root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); no_delete: clear_inode(inode); return; @@ -4915,7 +4908,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, int err; int drop_inode = 0; u64 objectid; - unsigned long nr = 0; u64 index = 0; if (!new_valid_dev(rdev)) @@ -4965,9 +4957,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4983,7 +4974,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int drop_inode = 0; int err; - unsigned long nr = 0; u64 objectid; u64 index = 0; @@ -5033,13 +5023,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); } out_unlock: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5050,7 +5039,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = old_dentry->d_inode; u64 index; - unsigned long nr = 0; int err; int drop_inode = 0; @@ -5094,14 +5082,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -5114,7 +5101,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int drop_on_err = 0; u64 objectid = 0; u64 index = 0; - unsigned long nr = 1; /* * 2 items for inode and ref @@ -5160,11 +5146,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) drop_on_err = 0; out_fail: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } @@ -6872,7 +6857,6 @@ static int btrfs_truncate(struct inode *inode) int ret; int err = 0; struct btrfs_trans_handle *trans; - unsigned long nr; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); @@ -6995,9 +6979,8 @@ static int btrfs_truncate(struct inode *inode) break; } - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { @@ -7031,9 +7014,8 @@ static int btrfs_truncate(struct inode *inode) if (ret && !err) err = ret; - nr = trans->blocks_used; ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); } out: @@ -7594,7 +7576,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; - unsigned long nr = 0; name_len = strlen(symname) + 1; if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) @@ -7692,13 +7673,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, out_unlock: if (!err) d_instantiate(dentry, inode); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); return err; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 270f24ffe1b..300e09ac365 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root_item *root_item; struct btrfs_path *path; struct extent_buffer *leaf; - unsigned long nr; int level; int max_level; int replaced = 0; @@ -2126,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path->slots[level]); root_item->drop_level = level; - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -2156,10 +2154,9 @@ out: btrfs_update_reloc_root(trans, root); } - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3262,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; - unsigned long nr; int ret = 0; if (inode) @@ -3296,9 +3292,8 @@ truncate: ret = btrfs_truncate_free_space_cache(root, trans, path, inode); btrfs_free_path(path); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); out: iput(inode); return ret; @@ -3715,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; - unsigned long nr; u64 flags; u32 item_size; int ret; @@ -3832,9 +3826,8 @@ restart: ret = btrfs_commit_transaction(trans, rc->extent_root); BUG_ON(ret); } else { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } trans = NULL; @@ -3864,9 +3857,8 @@ restart: GFP_NOFS); if (trans) { - nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); + btrfs_btree_balance_dirty(rc->extent_root); } if (!err) { @@ -3945,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - unsigned long nr; u64 objectid = BTRFS_FIRST_FREE_OBJECTID; int err = 0; @@ -3973,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = btrfs_orphan_add(trans, inode); out: - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty(root); if (err) { if (inode) iput(inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f21f39f0b1a..7b297354e73 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -952,7 +952,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - unsigned long nr; if (xchg(&root->defrag_running, 1)) return 0; @@ -964,9 +963,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) ret = btrfs_defrag_leaves(trans, root, cacheonly); - nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root, nr); + btrfs_btree_balance_dirty(info->tree_root); cond_resched(); if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) -- cgit v1.2.3 From d25628bdd66aedd6e07729d8dc6c8ee846d66d72 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 14 Nov 2012 14:35:30 +0000 Subject: Btrfs: protect devices list with its mutex Since we've kill the bigger one volume_mutex, we need to add devices list mutex back. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db79fb7e7e9..92e586bc800 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1681,16 +1681,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. - */ + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); goto error; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) { -- cgit v1.2.3 From d9d181c1ba7aa09a6d2698e8c7e75b515524d504 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 09:58:09 +0100 Subject: Btrfs: rename the scrub context structure The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit is a first preparation step to adapt the scrub code to be shareable for the device replace procedure. The block device will be removed from the scrub context state structure in a later step. It used to be the source block device. The scrub code as it is used for the device replace procedure reads the source data from whereever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty so that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it would handle the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. Therefore the block device pointer to the source disk is removed in a later patch, and therefore the context structure is renamed (this is the goal of the current patch) to reflect that no source block device scope is there anymore. Summary: This first preparation step consists of a textual substitution of the term "dev" to the term "ctx" whereever the scrub context is used. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 504 ++++++++++++++++++++++++++--------------------------- fs/btrfs/volumes.h | 2 +- 2 files changed, 253 insertions(+), 253 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69..29c8aac5bda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -42,10 +42,10 @@ */ struct scrub_block; -struct scrub_dev; +struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -66,7 +66,7 @@ struct scrub_page { struct scrub_bio { int index; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct bio *bio; int err; u64 logical; @@ -82,7 +82,7 @@ struct scrub_block { int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct { unsigned int header_error:1; unsigned int checksum_error:1; @@ -91,8 +91,8 @@ struct scrub_block { }; }; -struct scrub_dev { - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; +struct scrub_ctx { + struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; struct btrfs_device *dev; int first_free; int curr; @@ -116,7 +116,7 @@ struct scrub_dev { }; struct scrub_fixup_nodatasum { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -138,7 +138,7 @@ struct scrub_warning { static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); @@ -163,9 +163,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); @@ -173,27 +173,27 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_free_csums(struct scrub_dev *sdev) +static void scrub_free_csums(struct scrub_ctx *sctx) { - while (!list_empty(&sdev->csum_list)) { + while (!list_empty(&sctx->csum_list)) { struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sdev->csum_list, + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); list_del(&sum->list); kfree(sum); } } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; - if (!sdev) + if (!sctx) return; /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; + if (sctx->curr != -1) { + struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { BUG_ON(!sbio->pagev[i]); @@ -203,69 +203,69 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio = sdev->bios[i]; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) break; kfree(sbio); } - scrub_free_csums(sdev); - kfree(sdev); + scrub_free_csums(sctx); + kfree(sctx); } static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; int pages_per_bio; pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, bio_get_nr_vecs(dev->bdev)); - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); - if (!sdev) + sctx = kzalloc(sizeof(*sctx), GFP_NOFS); + if (!sctx) goto nomem; - sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { + sctx->dev = dev; + sctx->pages_per_bio = pages_per_bio; + sctx->curr = -1; + for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); if (!sbio) goto nomem; - sdev->bios[i] = sbio; + sctx->bios[i] = sbio; sbio->index = i; - sbio->sdev = sdev; + sbio->sctx = sctx; sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_DEV-1) - sdev->bios[i]->next_free = i + 1; + if (i != SCRUB_BIOS_PER_CTX - 1) + sctx->bios[i]->next_free = i + 1; else - sdev->bios[i]->next_free = -1; - } - sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sdev->in_flight, 0); - atomic_set(&sdev->fixup_cnt, 0); - atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sdev->csum_list); - - spin_lock_init(&sdev->list_lock); - spin_lock_init(&sdev->stat_lock); - init_waitqueue_head(&sdev->list_wait); - return sdev; + sctx->bios[i]->next_free = -1; + } + sctx->first_free = 0; + sctx->nodesize = dev->dev_root->nodesize; + sctx->leafsize = dev->dev_root->leafsize; + sctx->sectorsize = dev->dev_root->sectorsize; + atomic_set(&sctx->in_flight, 0); + atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->cancel_req, 0); + sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); + INIT_LIST_HEAD(&sctx->csum_list); + + spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); + init_waitqueue_head(&sctx->list_wait); + return sctx; nomem: - scrub_free_dev(sdev); + scrub_free_ctx(sctx); return ERR_PTR(-ENOMEM); } @@ -345,7 +345,7 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sdev->dev; + struct btrfs_device *dev = sblock->sctx->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -530,21 +530,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) { int ret; struct scrub_fixup_nodatasum *fixup; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_trans_handle *trans = NULL; struct btrfs_fs_info *fs_info; struct btrfs_path *path; int uncorrectable = 0; fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sdev = fixup->sdev; + sctx = fixup->sctx; fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.malloc_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.malloc_errors; + spin_unlock(&sctx->stat_lock); uncorrectable = 1; goto out; } @@ -573,22 +573,22 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) } WARN_ON(ret != 1); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.corrected_errors; + spin_unlock(&sctx->stat_lock); out: if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, fixup->root); if (uncorrectable) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.uncorrectable_errors; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } btrfs_free_path(path); @@ -599,9 +599,9 @@ out: atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sdev->fixup_cnt); + atomic_dec(&sctx->fixup_cnt); wake_up(&fs_info->scrub_pause_wait); - wake_up(&sdev->list_wait); + wake_up(&sctx->list_wait); } /* @@ -614,7 +614,7 @@ out: */ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sblock_to_check->sdev; + struct scrub_ctx *sctx = sblock_to_check->sctx; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 +633,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; + fs_info = sctx->dev->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -677,25 +677,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, logical, sblocks_for_recheck); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -704,13 +704,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* build and submit the bios for the failed mirror, check checksums */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); + csum, generation, sctx->csum_size); if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - btrfs_dev_stat_inc_and_print(sdev->dev, + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,45 +725,45 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * different bio (usually one of the two latter cases is * the cause) */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.unverified_errors++; + spin_unlock(&sctx->stat_lock); goto out; } if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.csum_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.verify_errors++; + spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sdev->readonly) + if (sctx->readonly) goto did_not_correct_error; if (!is_metadata && !have_csum) { @@ -779,7 +779,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); if (!fixup_nodatasum) goto did_not_correct_error; - fixup_nodatasum->sdev = sdev; + fixup_nodatasum->sctx = sctx; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -796,7 +796,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sdev->fixup_cnt); + atomic_inc(&sctx->fixup_cnt); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -818,7 +818,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) ret = scrub_recheck_block(fs_info, sblocks_for_recheck + mirror_index, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (ret) goto did_not_correct_error; } @@ -930,7 +930,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, csum, - generation, sdev->csum_size); + generation, sctx->csum_size); if (!ret && !sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -939,23 +939,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto did_not_correct_error; } else { corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.corrected_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } } else { did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sdev->dev->name)); + rcu_str_deref(sctx->dev->name)); } out: @@ -978,7 +978,7 @@ out: return 0; } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) @@ -988,7 +988,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, int ret; /* - * note: the three members sdev, ref_count and outstanding_pages + * note: the three members sctx, ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1028,9 +1028,9 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } @@ -1259,14 +1259,14 @@ static void scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; u64 len; int index; @@ -1278,7 +1278,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) page = sblock->pagev[0].page; buffer = kmap_atomic(page); - len = sdev->sectorsize; + len = sctx->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); @@ -1296,7 +1296,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) } btrfs_csum_final(crc, csum); - if (memcmp(csum, on_disk_csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sctx->csum_size)) fail = 1; return fail; @@ -1304,9 +1304,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) static int scrub_checksum_tree_block(struct scrub_block *sblock) { - struct scrub_dev *sdev = sblock->sdev; + struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1324,7 +1324,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; - memcpy(on_disk_csum, h->csum, sdev->csum_size); + memcpy(on_disk_csum, h->csum, sctx->csum_size); /* * we don't use the getter functions here, as we @@ -1345,8 +1345,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sdev->nodesize != sdev->leafsize); - len = sdev->nodesize - BTRFS_CSUM_SIZE; + BUG_ON(sctx->nodesize != sctx->leafsize); + len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; index = 0; @@ -1368,7 +1368,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++crc_fail; return fail || crc_fail; @@ -1377,8 +1377,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - struct scrub_dev *sdev = sblock->sdev; - struct btrfs_root *root = sdev->dev->dev_root; + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1396,7 +1396,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) page = sblock->pagev[0].page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sdev->csum_size); + memcpy(on_disk_csum, s->csum, sctx->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail_cor; @@ -1429,7 +1429,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) } btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -1438,14 +1438,14 @@ static int scrub_checksum_super(struct scrub_block *sblock) * They will get written with the next transaction commit * anyway */ - spin_lock(&sdev->stat_lock); - ++sdev->stat.super_errors; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sdev->dev, + btrfs_dev_stat_inc_and_print(sctx->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1469,21 +1469,21 @@ static void scrub_block_put(struct scrub_block *sblock) } } -static void scrub_submit(struct scrub_dev *sdev) +static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; - if (sdev->curr == -1) + if (sctx->curr == -1) return; - sbio = sdev->bios[sdev->curr]; - sdev->curr = -1; - atomic_inc(&sdev->in_flight); + sbio = sctx->bios[sctx->curr]; + sctx->curr = -1; + atomic_inc(&sctx->in_flight); btrfsic_submit_bio(READ, sbio->bio); } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, +static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; @@ -1494,20 +1494,20 @@ again: /* * grab a fresh bio or wait for one to become available */ - while (sdev->curr == -1) { - spin_lock(&sdev->list_lock); - sdev->curr = sdev->first_free; - if (sdev->curr != -1) { - sdev->first_free = sdev->bios[sdev->curr]->next_free; - sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->page_count = 0; - spin_unlock(&sdev->list_lock); + while (sctx->curr == -1) { + spin_lock(&sctx->list_lock); + sctx->curr = sctx->first_free; + if (sctx->curr != -1) { + sctx->first_free = sctx->bios[sctx->curr]->next_free; + sctx->bios[sctx->curr]->next_free = -1; + sctx->bios[sctx->curr]->page_count = 0; + spin_unlock(&sctx->list_lock); } else { - spin_unlock(&sdev->list_lock); - wait_event(sdev->list_wait, sdev->first_free != -1); + spin_unlock(&sctx->list_lock); + wait_event(sctx->list_wait, sctx->first_free != -1); } } - sbio = sdev->bios[sdev->curr]; + sbio = sctx->bios[sctx->curr]; if (sbio->page_count == 0) { struct bio *bio; @@ -1515,7 +1515,7 @@ again: sbio->logical = spage->logical; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1523,14 +1523,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; + bio->bi_bdev = sctx->dev->bdev; bio->bi_sector = spage->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != spage->logical) { - scrub_submit(sdev); + scrub_submit(sctx); goto again; } @@ -1542,20 +1542,20 @@ again: sbio->bio = NULL; return -EIO; } - scrub_submit(sdev); + scrub_submit(sctx); goto again; } scrub_block_get(sblock); /* one for the added page */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sdev->pages_per_bio) - scrub_submit(sdev); + if (sbio->page_count == sctx->pages_per_bio) + scrub_submit(sctx); return 0; } -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { @@ -1564,15 +1564,15 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, sblock = kzalloc(sizeof(*sblock), GFP_NOFS); if (!sblock) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); return -ENOMEM; } /* one ref inside this function, plus one for each page later on */ atomic_set(&sblock->ref_count, 1); - sblock->sdev = sdev; + sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { @@ -1582,9 +1582,9 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); spage->page = alloc_page(GFP_NOFS); if (!spage->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); while (index > 0) { index--; __free_page(sblock->pagev[index].page); @@ -1593,7 +1593,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sdev->dev; + spage->dev = sctx->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1601,7 +1601,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sdev->csum_size); + memcpy(spage->csum, csum, sctx->csum_size); } else { spage->have_csum = 0; } @@ -1616,7 +1616,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, struct scrub_page *spage = sblock->pagev + index; int ret; - ret = scrub_add_page_to_bio(sdev, spage); + ret = scrub_add_page_to_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1624,7 +1624,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, } if (force) - scrub_submit(sdev); + scrub_submit(sctx); /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); @@ -1634,8 +1634,8 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct scrub_ctx *sctx = sbio->sctx; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1646,7 +1646,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) static void scrub_bio_end_io_worker(struct btrfs_work *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; + struct scrub_ctx *sctx = sbio->sctx; int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); @@ -1671,12 +1671,12 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) bio_put(sbio->bio); sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + spin_lock(&sctx->list_lock); + sbio->next_free = sctx->first_free; + sctx->first_free = sbio->index; + spin_unlock(&sctx->list_lock); + atomic_dec(&sctx->in_flight); + wake_up(&sctx->list_wait); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1687,7 +1687,7 @@ static void scrub_block_complete(struct scrub_block *sblock) scrub_checksum(sblock); } -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; @@ -1695,15 +1695,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, unsigned long i; unsigned long num_sectors; - while (!list_empty(&sdev->csum_list)) { - sum = list_first_entry(&sdev->csum_list, + while (!list_empty(&sctx->csum_list)) { + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); if (sum->bytenr > logical) return 0; if (sum->bytenr + sum->len > logical) break; - ++sdev->stat.csum_discards; + ++sctx->stat.csum_discards; list_del(&sum->list); kfree(sum); sum = NULL; @@ -1711,10 +1711,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sdev->sectorsize; + num_sectors = sum->len / sctx->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); + memcpy(csum, &sum->sums[i].sum, sctx->csum_size); ret = 1; break; } @@ -1727,7 +1727,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, } /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; @@ -1735,20 +1735,20 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sdev->sectorsize; - spin_lock(&sdev->stat_lock); - sdev->stat.data_extents_scrubbed++; - sdev->stat.data_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + blocksize = sctx->sectorsize; + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed++; + sctx->stat.data_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sdev->nodesize != sdev->leafsize); - blocksize = sdev->nodesize; - spin_lock(&sdev->stat_lock); - sdev->stat.tree_extents_scrubbed++; - sdev->stat.tree_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); + BUG_ON(sctx->nodesize != sctx->leafsize); + blocksize = sctx->nodesize; + spin_lock(&sctx->stat_lock); + sctx->stat.tree_extents_scrubbed++; + sctx->stat.tree_bytes_scrubbed += len; + spin_unlock(&sctx->stat_lock); } else { - blocksize = sdev->sectorsize; + blocksize = sctx->sectorsize; BUG_ON(1); } @@ -1758,11 +1758,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (flags & BTRFS_EXTENT_FLAG_DATA) { /* push csums to sbio */ - have_csum = scrub_find_csum(sdev, logical, l, csum); + have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) - ++sdev->stat.no_csum; + ++sctx->stat.no_csum; } - ret = scrub_pages(sdev, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, flags, gen, mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; @@ -1773,11 +1773,11 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, return 0; } -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1843,8 +1843,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ logical = base + offset; - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1898,7 +1898,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sdev->cancel_req)) { + atomic_read(&sctx->cancel_req)) { ret = -ECANCELED; goto out; } @@ -1907,9 +1907,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - scrub_submit(sdev); - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); + scrub_submit(sctx); + wait_event(sctx->list_wait, + atomic_read(&sctx->in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -1926,7 +1926,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_lookup_csums_range(csum_root, logical, logical + map->stripe_len - 1, - &sdev->csum_list, 1); + &sctx->csum_list, 1); if (ret) goto out; @@ -2004,7 +2004,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, key.objectid; } - ret = scrub_extent(sdev, key.objectid, key.offset, + ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, flags, generation, mirror_num); if (ret) @@ -2016,12 +2016,12 @@ next: btrfs_release_path(path); logical += increment; physical += map->stripe_len; - spin_lock(&sdev->stat_lock); - sdev->stat.last_physical = physical; - spin_unlock(&sdev->stat_lock); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = physical; + spin_unlock(&sctx->stat_lock); } /* push queued extents */ - scrub_submit(sdev); + scrub_submit(sctx); out: blk_finish_plug(&plug); @@ -2029,12 +2029,12 @@ out: return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sdev->dev->dev_root->fs_info->mapping_tree; + &sctx->dev->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2055,9 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev && + if (map->stripes[i].dev == sctx->dev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sdev, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, i, chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2069,11 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sdev->dev->dev_root; + struct btrfs_root *root = sctx->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,7 +2094,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sdev->dev->devid; + key.objectid = sctx->dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; @@ -2117,7 +2117,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sdev->dev->devid) + if (found_key.objectid != sctx->dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2151,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,13 +2170,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sdev->dev; + struct btrfs_device *device = sctx->dev; struct btrfs_root *root = device->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -2189,12 +2189,12 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); return 0; } @@ -2238,7 +2238,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly) { - struct scrub_dev *sdev; + struct scrub_ctx *sctx; struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; @@ -2302,41 +2302,41 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, scrub_workers_put(root); return -EINPROGRESS; } - sdev = scrub_setup_dev(dev); - if (IS_ERR(sdev)) { + sctx = scrub_setup_ctx(dev); + if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); scrub_workers_put(root); - return PTR_ERR(sdev); + return PTR_ERR(sctx); } - sdev->readonly = readonly; - dev->scrub_device = sdev; + sctx->readonly = readonly; + dev->scrub_device = sctx; atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sdev); + ret = scrub_supers(sctx); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sdev, start, end); + ret = scrub_enumerate_chunks(sctx, start, end); - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); if (progress) - memcpy(progress, &sdev->stat, sizeof(*progress)); + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_lock(&fs_info->scrub_lock); dev->scrub_device = NULL; mutex_unlock(&fs_info->scrub_lock); - scrub_free_dev(sdev); + scrub_free_ctx(sctx); scrub_workers_put(root); return ret; @@ -2407,15 +2407,15 @@ int btrfs_scrub_cancel(struct btrfs_root *root) int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = root->fs_info; - struct scrub_dev *sdev; + struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); - sdev = dev->scrub_device; - if (!sdev) { + sctx = dev->scrub_device; + if (!sctx) { mutex_unlock(&fs_info->scrub_lock); return -ENOTCONN; } - atomic_inc(&sdev->cancel_req); + atomic_inc(&sctx->cancel_req); while (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait, @@ -2453,15 +2453,15 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { struct btrfs_device *dev; - struct scrub_dev *sdev = NULL; + struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(root, devid, NULL, NULL); if (dev) - sdev = dev->scrub_device; - if (sdev) - memcpy(progress, &sdev->stat, sizeof(*progress)); + sctx = dev->scrub_device; + if (sctx) + memcpy(progress, &sctx->stat, sizeof(*progress)); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; + return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53c06af92e8..1789cda57ef 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -88,7 +88,7 @@ struct btrfs_device { u8 uuid[BTRFS_UUID_SIZE]; /* per-device scrub information */ - struct scrub_dev *scrub_device; + struct scrub_ctx *scrub_device; struct btrfs_work work; struct rcu_head rcu; -- cgit v1.2.3 From a36cf8b8933e4a7a7f2f2cbc3c70b097e97f7fd1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 13:26:57 +0100 Subject: Btrfs: remove the block device pointer from the scrub context struct The block device is removed from the scrub context state structure. The scrub code as it is used for the device replace procedure reads the source data from whereever it is optimal. The source device might even be gone (disconnected, for instance due to a hardware failure). Or the drive can be so faulty so that the device replace procedure tries to avoid access to the faulty source drive as much as possible, and only if all other mirrors are damaged, as a last resort, the source disk is accessed. The modified scrub code operates as if it would handle the source drive and thereby generates an exact copy of the source disk on the target disk, even if the source disk is not present at all. Therefore the block device pointer to the source disk is removed in the scrub context struct and moved into the lower level scope of scrub_bio, fixup and page structures where the block device context is known. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 133 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 29c8aac5bda..822c08a420c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -67,6 +67,7 @@ struct scrub_page { struct scrub_bio { int index; struct scrub_ctx *sctx; + struct btrfs_device *dev; struct bio *bio; int err; u64 logical; @@ -93,7 +94,7 @@ struct scrub_block { struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; - struct btrfs_device *dev; + struct btrfs_root *dev_root; int first_free; int curr; atomic_t in_flight; @@ -117,6 +118,7 @@ struct scrub_ctx { struct scrub_fixup_nodatasum { struct scrub_ctx *sctx; + struct btrfs_device *dev; u64 logical; struct btrfs_root *root; struct btrfs_work work; @@ -166,8 +168,8 @@ static void scrub_block_put(struct scrub_block *sblock); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); @@ -228,9 +230,9 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; - sctx->dev = dev; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; + sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { struct scrub_bio *sbio; @@ -345,8 +347,8 @@ err: static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sblock->sctx->dev; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_device *dev; + struct btrfs_fs_info *fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -361,15 +363,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) const int bufsize = 4096; int ret; + WARN_ON(sblock->page_count < 1); + dev = sblock->pagev[0].dev; + fs_info = sblock->sctx->dev_root->fs_info; + path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); swarn.sector = (sblock->pagev[0].physical) >> 9; swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; - swarn.dev = dev; + swarn.dev = NULL; swarn.msg_bufsize = bufsize; swarn.scratch_bufsize = bufsize; @@ -405,6 +410,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); } else { swarn.path = path; + swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, scrub_print_warning_inode, &swarn); @@ -588,7 +594,7 @@ out: printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -615,6 +621,7 @@ out: static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; + struct btrfs_device *dev; struct btrfs_fs_info *fs_info; u64 length; u64 logical; @@ -633,7 +640,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sctx->dev->dev_root->fs_info; + fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0].logical; generation = sblock_to_check->pagev[0].generation; @@ -643,6 +650,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->pagev[0].have_csum; csum = sblock_to_check->pagev[0].csum; + dev = sblock_to_check->pagev[0].dev; /* * read all mirrors one after the other. This includes to @@ -682,8 +690,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -695,8 +702,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -710,8 +716,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.read_errors++; sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -738,15 +743,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, - BTRFS_DEV_STAT_READ_ERRS); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sctx->stat_lock); @@ -756,10 +760,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_GENERATION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); } @@ -780,6 +784,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!fixup_nodatasum) goto did_not_correct_error; fixup_nodatasum->sctx = sctx; + fixup_nodatasum->dev = dev; fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; @@ -945,7 +950,7 @@ corrected_error: printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -955,7 +960,7 @@ did_not_correct_error: printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", (unsigned long long)logical, - rcu_str_deref(sctx->dev->name)); + rcu_str_deref(dev->name)); } out: @@ -1266,7 +1271,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1306,7 +1311,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1378,7 +1383,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1442,10 +1447,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sctx->dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1513,6 +1518,7 @@ again: sbio->physical = spage->physical; sbio->logical = spage->logical; + sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); @@ -1523,13 +1529,14 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sctx->dev->bdev; - bio->bi_sector = spage->physical >> 9; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != - spage->logical) { + spage->logical || + sbio->dev != spage->dev) { scrub_submit(sctx); goto again; } @@ -1556,8 +1563,8 @@ again: } static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_block *sblock; int index; @@ -1593,7 +1600,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->dev = sctx->dev; + spage->dev = dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; @@ -1634,8 +1641,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void scrub_bio_end_io(struct bio *bio, int err) { struct scrub_bio *sbio = bio->bi_private; - struct scrub_ctx *sctx = sbio->sctx; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; sbio->err = err; sbio->bio = bio; @@ -1728,7 +1734,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num) + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1762,7 +1769,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, flags, gen, + ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; @@ -1774,10 +1781,12 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, } static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, - struct map_lookup *map, int num, u64 base, u64 length) + struct map_lookup *map, + struct btrfs_device *scrub_dev, + int num, u64 base, u64 length) { struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sctx->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -1797,7 +1806,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct reada_control *reada2; struct btrfs_key key_start; struct btrfs_key key_end; - u64 increment = map->stripe_len; u64 offset; @@ -2006,7 +2014,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = scrub_extent(sctx, key.objectid, key.offset, key.objectid - logical + physical, - flags, generation, mirror_num); + scrub_dev, flags, generation, + mirror_num); if (ret) goto out; @@ -2030,11 +2039,13 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + struct btrfs_device *scrub_dev, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = - &sctx->dev->dev_root->fs_info->mapping_tree; + &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -2055,9 +2066,10 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sctx->dev && + if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, map, i, chunk_offset, length); + ret = scrub_stripe(sctx, map, scrub_dev, i, + chunk_offset, length); if (ret) goto out; } @@ -2069,11 +2081,12 @@ out: } static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sctx->dev->dev_root; + struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; u64 chunk_tree; @@ -2094,11 +2107,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) path->search_commit_root = 1; path->skip_locking = 1; - key.objectid = sctx->dev->devid; + key.objectid = scrub_dev->devid; key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -2117,7 +2129,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != sctx->dev->devid) + if (found_key.objectid != scrub_dev->devid) break; if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,7 +2163,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) ret = -ENOENT; break; } - ret = scrub_chunk(sctx, chunk_tree, chunk_objectid, + ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) @@ -2170,14 +2182,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 start, u64 end) return ret < 0 ? ret : 0; } -static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; int ret; - struct btrfs_device *device = sctx->dev; - struct btrfs_root *root = device->dev_root; + struct btrfs_root *root = sctx->dev_root; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return -EIO; @@ -2186,11 +2198,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, 1); if (ret) return ret; } @@ -2317,11 +2330,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx); + ret = scrub_supers(sctx, dev); up_read(&fs_info->scrub_super_lock); if (!ret) - ret = scrub_enumerate_chunks(sctx, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); atomic_dec(&fs_info->scrubs_running); -- cgit v1.2.3 From 7a9e9987681198c56ac7f165725ca322d7a196e1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 14:58:04 +0100 Subject: Btrfs: make the scrub page array dynamically allocated With the modified design (in order to support the devive replace procedure) it is necessary to alloc the page array dynamically. The reason is that pages are reused. At first a page is used for the bio to read the data from the filesystem, then the same page is reused for the bio that writes the data to the target disk. Since the read process and the write process are completely decoupled, this requires a new concept of refcounts and get/put functions for pages, and it requires to use newly created pages for each read bio which are freed after the write operation is finished. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 195 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 74 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 822c08a420c..15ac82ae577 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -46,6 +46,12 @@ struct scrub_ctx; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { @@ -56,6 +62,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + atomic_t ref_count; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -79,7 +86,7 @@ struct scrub_bio { }; struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; atomic_t ref_count; /* free mem on transition to zero */ @@ -165,6 +172,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); static int scrub_add_page_to_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -364,15 +373,15 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) int ret; WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0].dev; + dev = sblock->pagev[0]->dev; fs_info = sblock->sctx->dev_root->fs_info; path = btrfs_alloc_path(); swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; + swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; swarn.msg_bufsize = bufsize; @@ -642,15 +651,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & + logical = sblock_to_check->pagev[0]->logical; + generation = sblock_to_check->pagev[0]->generation; + BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; - dev = sblock_to_check->pagev[0].dev; + have_csum = sblock_to_check->pagev[0]->have_csum; + csum = sblock_to_check->pagev[0]->csum; + dev = sblock_to_check->pagev[0]->dev; /* * read all mirrors one after the other. This includes to @@ -892,7 +901,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; if (!page_bad->io_error) continue; @@ -903,8 +912,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index++) { struct scrub_block *sblock_other = sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; + struct scrub_page *page_other = sblock_other->pagev[ + page_num]; if (!page_other->io_error) { ret = scrub_repair_page_from_good_copy( @@ -971,11 +980,11 @@ out: mirror_index; int page_index; - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); + for (page_index = 0; page_index < sblock->page_count; + page_index++) { + sblock->pagev[page_index]->sblock = NULL; + scrub_page_put(sblock->pagev[page_index]); + } } kfree(sblocks_for_recheck); } @@ -993,7 +1002,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, int ret; /* - * note: the three members sctx, ref_count and outstanding_pages + * note: the two members ref_count and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ @@ -1025,21 +1034,27 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, continue; sblock = sblocks_for_recheck + mirror_index; - page = sblock->pagev + page_index; - page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, dev->bdev is NULL */ - page->dev = bbio->stripes[mirror_index].dev; - page->mirror_num = mirror_index + 1; - page->page = alloc_page(GFP_NOFS); - if (!page->page) { + sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); + if (!page) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); kfree(bbio); return -ENOMEM; } + scrub_page_get(page); + sblock->pagev[page_index] = page; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; + page->mirror_num = mirror_index + 1; sblock->page_count++; + page->page = alloc_page(GFP_NOFS); + if (!page->page) + goto leave_nomem; } kfree(bbio); length -= sublen; @@ -1071,7 +1086,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; int ret; - struct scrub_page *page = sblock->pagev + page_num; + struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { @@ -1080,7 +1095,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } - BUG_ON(!page->page); + WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1125,14 +1140,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; - BUG_ON(!sblock->pagev[0].page); + WARN_ON(!sblock->pagev[0]->page); if (is_metadata) { struct btrfs_header *h; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { @@ -1146,7 +1161,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, if (!have_csum) return; - mapped_buffer = kmap_atomic(sblock->pagev[0].page); + mapped_buffer = kmap_atomic(sblock->pagev[0]->page); } for (page_num = 0;;) { @@ -1162,9 +1177,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, page_num++; if (page_num >= sblock->page_count) break; - BUG_ON(!sblock->pagev[page_num].page); + WARN_ON(!sblock->pagev[page_num]->page); - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); + mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); } btrfs_csum_final(crc, calculated_csum); @@ -1202,11 +1217,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - struct scrub_page *page_good = sblock_good->pagev + page_num; + struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_page *page_good = sblock_good->pagev[page_num]; - BUG_ON(sblock_bad->pagev[page_num].page == NULL); - BUG_ON(sblock_good->pagev[page_num].page == NULL); + BUG_ON(page_bad->page == NULL); + BUG_ON(page_good->page == NULL); if (force_write || sblock_bad->header_error || sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; @@ -1247,8 +1262,8 @@ static void scrub_checksum(struct scrub_block *sblock) u64 flags; int ret; - BUG_ON(sblock->page_count < 1); - flags = sblock->pagev[0].flags; + WARN_ON(sblock->page_count < 1); + flags = sblock->pagev[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1276,11 +1291,11 @@ static int scrub_checksum_data(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0].have_csum) + if (!sblock->pagev[0]->have_csum) return 0; - on_disk_csum = sblock->pagev[0].csum; - page = sblock->pagev[0].page; + on_disk_csum = sblock->pagev[0]->csum; + page = sblock->pagev[0]->page; buffer = kmap_atomic(page); len = sctx->sectorsize; @@ -1295,8 +1310,8 @@ static int scrub_checksum_data(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; buffer = kmap_atomic(page); } @@ -1326,7 +1341,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); h = (struct btrfs_header *)mapped_buffer; memcpy(on_disk_csum, h->csum, sctx->csum_size); @@ -1337,10 +1352,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) ++fail; - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1365,8 +1380,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1398,15 +1413,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) int index; BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; + page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) ++fail_cor; - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1426,8 +1441,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) break; index++; BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; + BUG_ON(!sblock->pagev[index]->page); + page = sblock->pagev[index]->page; mapped_buffer = kmap_atomic(page); mapped_size = PAGE_SIZE; p = mapped_buffer; @@ -1447,10 +1462,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev, + btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1468,12 +1483,25 @@ static void scrub_block_put(struct scrub_block *sblock) int i; for (i = 0; i < sblock->page_count; i++) - if (sblock->pagev[i].page) - __free_page(sblock->pagev[i].page); + scrub_page_put(sblock->pagev[i]); kfree(sblock); } } +static void scrub_page_get(struct scrub_page *spage) +{ + atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ + if (atomic_dec_and_test(&spage->ref_count)) { + if (spage->page) + __free_page(spage->page); + kfree(spage); + } +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1577,28 +1605,28 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page later on */ + /* one ref inside this function, plus one for each page added to + * a bio later on */ atomic_set(&sblock->ref_count, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage; u64 l = min_t(u64, len, PAGE_SIZE); - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) { + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - while (index > 0) { - index--; - __free_page(sblock->pagev[index].page); - } - kfree(sblock); + scrub_block_put(sblock); return -ENOMEM; } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + scrub_page_get(spage); + sblock->pagev[index] = spage; spage->sblock = sblock; spage->dev = dev; spage->flags = flags; @@ -1613,14 +1641,17 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, spage->have_csum = 0; } sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; len -= l; logical += l; physical += l; } - BUG_ON(sblock->page_count == 0); + WARN_ON(sblock->page_count == 0); for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev + index; + struct scrub_page *spage = sblock->pagev[index]; int ret; ret = scrub_add_page_to_bio(sctx, spage); @@ -2289,6 +2320,22 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } + if (fs_info->chunk_root->nodesize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || + fs_info->chunk_root->sectorsize > + PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + /* + * would exhaust the array bounds of pagev member in + * struct scrub_block + */ + pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", + fs_info->chunk_root->nodesize, + SCRUB_MAX_PAGES_PER_BLOCK, + fs_info->chunk_root->sectorsize, + SCRUB_MAX_PAGES_PER_BLOCK); + return -EINVAL; + } + ret = scrub_workers_get(root); if (ret) return ret; -- cgit v1.2.3 From cb2ced73d8c7a38b5f699e267deadf2a2cfe911c Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:14:21 +0100 Subject: Btrfs: in scrub repair code, optimize the reading of mirrors In case that disk blocks need to be repaired (rewritten), the current code at first (for simplicity reasons) reads all alternate mirrors in the first step, afterwards selects the best one in a second step. This is now changed to read one alternate mirror after the other and to leave the loop early when a perfect mirror is found. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 15ac82ae577..7d38f407324 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -819,26 +819,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (ret) - goto did_not_correct_error; - } - - /* - * first try to pick the mirror which is completely without I/O + * checksums. + * First try to pick the mirror which is completely without I/O * errors and also does not have a checksum error. * If one is found, and if a checksum is present, the full block * that is known to contain an error is rewritten. Afterwards @@ -854,10 +836,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; + struct scrub_block *sblock_other; - if (!sblock_other->header_error && + if (mirror_index == failed_mirror_index) + continue; + sblock_other = sblocks_for_recheck + mirror_index; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + if (!ret && !sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; -- cgit v1.2.3 From 34f5c8e90b3f002672cd6b4e6e7c5b959fd981ae Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:16:26 +0100 Subject: Btrfs: in scrub repair code, simplify alloc error handling In the scrub repair code, the code is changed to handle memory allocation errors a little bit smarter. The change is to handle it just like a read error. This simplifies the code and removes a couple of lines of code, since the code to handle read errors is there anyway. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 61 ++++++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 35 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7d38f407324..fcd5bccaa4e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -151,10 +151,10 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, u64 length, u64 logical, struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -718,16 +718,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_bad = sblocks_for_recheck + failed_mirror_index; /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size); - if (ret) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } + scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sctx->csum_size); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -843,10 +835,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_other, is_metadata, - have_csum, csum, generation, - sctx->csum_size); - if (!ret && !sblock_other->header_error && + scrub_recheck_block(fs_info, sblock_other, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + + if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { int force_write = is_metadata || have_csum; @@ -931,10 +924,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * is verified, but most likely the data comes out * of the page cache. */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sctx->csum_size); - if (!ret && !sblock_bad->header_error && + scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sctx->csum_size); + if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) goto corrected_error; @@ -1061,10 +1054,10 @@ leave_nomem: * to take those pages that are not errored from all the mirrors so that * the pages that are errored in the just handled mirror can be repaired. */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size) +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { int page_num; @@ -1074,7 +1067,6 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; - int ret; struct scrub_page *page = sblock->pagev[page_num]; DECLARE_COMPLETION_ONSTACK(complete); @@ -1086,18 +1078,17 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; + if (!bio) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } + bio_add_page(bio, page->page, PAGE_SIZE, 0); btrfsic_submit_bio(READ, bio); /* this will also unplug the queue */ @@ -1114,7 +1105,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, have_csum, csum, generation, csum_size); - return 0; + return; } static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From b6bfebc13218f1fc1502041a810919d3a81b8b4e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 2 Nov 2012 16:44:58 +0100 Subject: Btrfs: cleanup scrub bio and worker wait code Just move some code into functions to make everything more readable. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 106 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 35 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fcd5bccaa4e..a67b1a17a00 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 STRATO. All rights reserved. + * Copyright (C) 2011, 2012 STRATO. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -104,8 +104,8 @@ struct scrub_ctx { struct btrfs_root *dev_root; int first_free; int curr; - atomic_t in_flight; - atomic_t fixup_cnt; + atomic_t bios_in_flight; + atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -146,6 +146,10 @@ struct scrub_warning { }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_mapping_tree *map_tree, @@ -184,6 +188,59 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ + atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ + atomic_dec(&sctx->bios_in_flight); + wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * increment scrubs_running to prevent cancel requests from + * completing as long as a worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. + */ + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sctx->workers_pending); +} + +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + /* + * see scrub_pending_trans_workers_inc() why we're pretending + * to be paused in the scrub counters + */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sctx->workers_pending); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sctx->list_wait); +} + static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -264,8 +321,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx->nodesize = dev->dev_root->nodesize; sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sctx->in_flight, 0); - atomic_set(&sctx->fixup_cnt, 0); + atomic_set(&sctx->bios_in_flight, 0); + atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sctx->csum_list); @@ -609,14 +666,7 @@ out: btrfs_free_path(path); kfree(fixup); - /* see caller why we're pretending to be paused in the scrub counters */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sctx->fixup_cnt); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sctx->list_wait); + scrub_pending_trans_workers_dec(sctx); } /* @@ -789,20 +839,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fixup_nodatasum->logical = logical; fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a fixup worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the fixup worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. - */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sctx->fixup_cnt); + scrub_pending_trans_workers_inc(sctx); fixup_nodatasum->work.func = scrub_fixup_nodatasum; btrfs_queue_worker(&fs_info->scrub_workers, &fixup_nodatasum->work); @@ -1491,7 +1528,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; - atomic_inc(&sctx->in_flight); + scrub_pending_bio_inc(sctx); btrfsic_submit_bio(READ, sbio->bio); } @@ -1692,8 +1729,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); - atomic_dec(&sctx->in_flight); - wake_up(&sctx->list_wait); + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) @@ -1863,7 +1899,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, logical = base + offset; wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); @@ -1928,7 +1964,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* push queued extents */ scrub_submit(sctx); wait_event(sctx->list_wait, - atomic_read(&sctx->in_flight) == 0); + atomic_read(&sctx->bios_in_flight) == 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2218,7 +2254,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (ret) return ret; } - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); return 0; } @@ -2363,11 +2399,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); - wait_event(sctx->list_wait, atomic_read(&sctx->in_flight) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sctx->list_wait, atomic_read(&sctx->fixup_cnt) == 0); + wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); -- cgit v1.2.3 From beaf8ab3afef27ed81255d9808b67f6d390ca06f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 12 Nov 2012 14:03:45 +0100 Subject: Btrfs: move some common code into a subfunction Some code to open block devices, to read the superblock and to handle errors was repeated multiple times in 3 places, and the following patch makes use of it as well. This code is now moved into a subfunction. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 93 +++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 43 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 92e586bc800..4def1fdbf75 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,6 +108,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, + int flush, struct block_device **bdev, + struct buffer_head **bh) +{ + int ret; + + *bdev = blkdev_get_by_path(device_path, flags, holder); + + if (IS_ERR(*bdev)) { + ret = PTR_ERR(*bdev); + printk(KERN_INFO "btrfs: open %s failed\n", device_path); + goto error; + } + + if (flush) + filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + ret = set_blocksize(*bdev, 4096); + if (ret) { + blkdev_put(*bdev, flags); + goto error; + } + invalidate_bdev(*bdev); + *bh = btrfs_read_dev_super(*bdev); + if (!*bh) { + ret = -EINVAL; + blkdev_put(*bdev, flags); + goto error; + } + + return 0; + +error: + *bdev = NULL; + *bh = NULL; + return ret; +} + static void requeue_list(struct btrfs_pending_bios *pending_bios, struct bio *head, struct bio *tail) { @@ -637,18 +675,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name->str, flags, holder); - if (IS_ERR(bdev)) { - printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); - goto error; - } - filemap_write_and_wait(bdev->bd_inode->i_mapping); - invalidate_bdev(bdev); - set_blocksize(bdev, 4096); - - bh = btrfs_read_dev_super(bdev); - if (!bh) - goto error_close; + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + continue; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -697,9 +727,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); -error_close: blkdev_put(bdev, flags); -error: continue; } if (fs_devices->open_devices == 0) { @@ -744,22 +772,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 total_devices; flags |= FMODE_EXCL; - bdev = blkdev_get_by_path(path, flags, holder); - - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } - mutex_lock(&uuid_mutex); - ret = set_blocksize(bdev, 4096); + ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); if (ret) - goto error_close; - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } + goto error; disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); @@ -777,10 +793,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; brelse(bh); -error_close: - mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: + mutex_unlock(&uuid_mutex); return ret; } @@ -1374,20 +1389,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) goto out; - } - - set_blocksize(bdev, 4096); - invalidate_bdev(bdev); - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; -- cgit v1.2.3 From 7ba15b7d211846c187a7c5dc75a5964476f8bc89 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:42:30 +0100 Subject: Btrfs: add two more find_device() methods The new function btrfs_find_device_missing_or_by_path() will be used for the device replace procedure. This function itself calls the second new function btrfs_find_device_by_path(). Unfortunately, it is not possible to currently make the rest of the code use these functions as well, since all functions that look similar at first view are all a little bit different in what they are doing. But in the future, new code could benefit from these two new functions, and currently, device replace uses them. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 5 +++++ 2 files changed, 64 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4def1fdbf75..1483041eb86 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1522,6 +1522,65 @@ error_undo: goto error_brelse; } +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) +{ + int ret = 0; + struct btrfs_super_block *disk_super; + u64 devid; + u8 *dev_uuid; + struct block_device *bdev; + struct buffer_head *bh; + + *device = NULL; + ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, + root->fs_info->bdev_holder, 0, &bdev, &bh); + if (ret) + return ret; + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_uuid = disk_super->dev_item.uuid; + *device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + brelse(bh); + if (!*device) + ret = -ENOENT; + blkdev_put(bdev, FMODE_READ); + return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device) +{ + *device = NULL; + if (strcmp(device_path, "missing") == 0) { + struct list_head *devices; + struct btrfs_device *tmp; + + devices = &root->fs_info->fs_devices->devices; + /* + * It is safe to read the devices since the volume_mutex + * is held by the caller. + */ + list_for_each_entry(tmp, devices, dev_list) { + if (tmp->in_fs_metadata && !tmp->bdev) { + *device = tmp; + break; + } + } + + if (!*device) { + pr_err("btrfs: no missing device found\n"); + return -ENOENT; + } + + return 0; + } else { + return btrfs_find_device_by_path(root, device_path, device); + } +} + /* * does all the dirty work required for changing file system's UUID. */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1789cda57ef..657bb12b306 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,6 +268,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, + char *device_path, + struct btrfs_device **device); +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); -- cgit v1.2.3 From 5d9640517d92d05843711ea982cbeff42d7ed32d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 14:59:07 +0100 Subject: Btrfs: Pass fs_info to btrfs_num_copies() instead of mapping_tree This is required for the device replace procedure in a later step. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 12 ++++++------ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 11 +++++------ fs/btrfs/volumes.c | 3 ++- fs/btrfs/volumes.h | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5a3e45db642..58dfac1359a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block( *next_blockp = NULL; if (0 == *num_copiesp) { *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data( chunk_len = num_bytes; num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2463,7 +2463,7 @@ static int btrfsic_process_written_superblock( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2960,7 +2960,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, struct btrfsic_block_data_ctx block_ctx; int match = 0; - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(state->root->fs_info, bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ff5d259ac27..ba2b931fd8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -387,7 +387,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(root->fs_info, eb->start, eb->len); if (num_copies == 1) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3c062c8d1d7..e0b7138909f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2044,10 +2044,10 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies > 1) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; ret = repair_io_failure(map_tree, start, failrec->len, failrec->logical, page, failrec->failed_mirror); @@ -2157,9 +2157,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * clean_io_failure() clean all those errors at once. */ } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies == 1) { /* * we only have a single copy of the data, so don't bother with diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1483041eb86..5612767b910 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3785,8 +3785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) } } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct extent_map_tree *em_tree = &map_tree->map_tree; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 657bb12b306..35ea4424963 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, -- cgit v1.2.3 From 3ec706c831d4c96905c287013c8228b21619a1d9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:46:42 +0100 Subject: Btrfs: pass fs_info to btrfs_map_block() instead of mapping_tree This is required for the device replace procedure in a later step. Two calling functions also had to be changed to have the fs_info pointer: repair_io_failure() and scrub_setup_recheck_block(). Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 19 +++++++++---------- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/inode.c | 12 +++++------- fs/btrfs/reada.c | 3 +-- fs/btrfs/scrub.c | 14 +++++++------- fs/btrfs/volumes.c | 11 +++++------ fs/btrfs/volumes.h | 2 +- 9 files changed, 32 insertions(+), 37 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 58dfac1359a..8f9abedae2c 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1582,7 +1582,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); device = multi->stripes[0].dev; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f8a358aee06..b4d438f6c2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1818,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e0b7138909f..62ec6e45f70 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1917,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err) * the standard behavior is to write all copies in a raid setup. here we only * want to write the one bad copy. so we do the mapping for ourselves and issue * submit_bio directly. - * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which * actually prevents the read that triggered the error from finishing. * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num) { @@ -1944,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, bio->bi_size = 0; map_length = length; - ret = btrfs_map_block(map_tree, WRITE, logical, + ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { bio_put(bio); @@ -1982,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, start, p, mirror_num); if (ret) break; @@ -2008,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; struct extent_state *state; int num_copies; int did_repair = 0; @@ -2044,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, - failrec->logical, failrec->len); + fs_info = BTRFS_I(inode)->root->fs_info; + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); if (num_copies > 1) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, start, failrec->len, + ret = repair_io_failure(fs_info, start, failrec->len, failrec->logical, page, failrec->failed_mirror); did_repair = !ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 711d12b8002..2eacfabd326 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -337,9 +337,9 @@ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); -struct btrfs_mapping_tree; +struct btrfs_fs_info; -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aabf747d056..5d1675a8c9e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1549,7 +1549,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_mapping_tree *map_tree; u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; @@ -1559,11 +1558,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, READ, logical, + ret = btrfs_map_block(root->fs_info, READ, logical, &map_length, NULL, 0); - /* Will always return 0 or 1 with map_multi == NULL */ + /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); if (map_length < length + size) return 1; @@ -6364,7 +6362,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; struct bio *bio; struct bio *orig_bio = dip->orig_bio; struct bio_vec *bvec = orig_bio->bi_io_vec; @@ -6377,7 +6374,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); @@ -6431,7 +6428,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, READ, + start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(bio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a955669519a..0ddc5659f94 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -323,7 +323,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; @@ -358,7 +357,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a67b1a17a00..894bb2732fc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -152,7 +152,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblock); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -523,7 +523,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) } if (PageUptodate(page)) { - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -544,8 +544,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) ret = -EIO; goto out; } - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fs_info = BTRFS_I(inode)->root->fs_info; + ret = repair_io_failure(fs_info, offset, PAGE_SIZE, fixup->logical, page, fixup->mirror_num); unlock_page(page); @@ -754,7 +754,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, &fs_info->mapping_tree, length, + ret = scrub_setup_recheck_block(sctx, fs_info, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -1012,7 +1012,7 @@ out: } static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_mapping_tree *map_tree, + struct btrfs_fs_info *fs_info, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1036,7 +1036,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5612767b910..96bb2e4446a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3826,13 +3826,14 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, return optimal; } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; @@ -4061,11 +4062,11 @@ out: return ret; } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, mirror_num); } @@ -4394,7 +4395,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, int mirror_num, int async_submit) { - struct btrfs_mapping_tree *map_tree; struct btrfs_device *dev; struct bio *first_bio = bio; u64 logical = (u64)bio->bi_sector << 9; @@ -4406,10 +4406,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, struct btrfs_bio *bbio = NULL; length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); if (ret) /* -ENOMEM */ return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 35ea4424963..ad5566d4f2c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -248,7 +248,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, -- cgit v1.2.3 From a8a6dab77997a371f1925a4001021eea3ee5cb88 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 15:50:14 +0100 Subject: Btrfs: add btrfs_scratch_superblock() function This new function is used by the device replace procedure in a later patch. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 18 ++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 19 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96bb2e4446a..6cd8a32c448 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5106,3 +5106,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root, stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; return 0; } + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + + bh = btrfs_read_dev_super(device->bdev); + if (!bh) + return -EINVAL; + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ad5566d4f2c..7eaaf4e6195 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -301,6 +301,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) -- cgit v1.2.3 From aa1b8cd409f05e1489ec77ff219eff6ed4b801b8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:03:39 +0100 Subject: Btrfs: pass fs_info instead of root A small number of functions that are used in a device replace procedure when the operation is resumed at mount time are unable to pass the same root pointer that would be used in the regular (ioctl) context. And since the root pointer is not required, only the fs_info is, the root pointer argument is replaced with the fs_info pointer argument. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 11 ++++---- fs/btrfs/disk-io.c | 4 +-- fs/btrfs/ioctl.c | 8 +++--- fs/btrfs/scrub.c | 76 ++++++++++++++++++++++++------------------------------ fs/btrfs/super.c | 2 +- fs/btrfs/volumes.c | 23 +++++++++-------- fs/btrfs/volumes.h | 2 +- 7 files changed, 60 insertions(+), 66 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9a078661eb..f8bb62c82b0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3540,15 +3540,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ba2b931fd8f..42a8024e935 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3283,9 +3283,9 @@ int close_ctree(struct btrfs_root *root) smp_mb(); /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); + btrfs_pause_balance(fs_info); - btrfs_scrub_cancel(root); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e262cd8c4a7..b40b827f93e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1343,7 +1343,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } - device = btrfs_find_device(root, devid, NULL, NULL); + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); @@ -2332,7 +2332,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -3089,7 +3089,7 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); if (copy_to_user(arg, sa, sizeof(*sa))) @@ -3104,7 +3104,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_scrub_cancel(root); + return btrfs_scrub_cancel(root->fs_info); } static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 894bb2732fc..6cf23f4f7bb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2262,9 +2262,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; mutex_lock(&fs_info->scrub_lock); @@ -2283,10 +2282,8 @@ out: return ret; } -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) btrfs_stop_workers(&fs_info->scrub_workers); @@ -2294,29 +2291,29 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) mutex_unlock(&fs_info->scrub_lock); } - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly) +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly) { struct scrub_ctx *sctx; - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_device *dev; - if (btrfs_fs_closing(root->fs_info)) + if (btrfs_fs_closing(fs_info)) return -EINVAL; /* * check some assumptions */ - if (root->nodesize != root->leafsize) { + if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { printk(KERN_ERR "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", - root->nodesize, root->leafsize); + fs_info->chunk_root->nodesize, + fs_info->chunk_root->leafsize); return -EINVAL; } - if (root->nodesize > BTRFS_STRIPE_LEN) { + if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum * the way scrub is implemented. Do not handle this @@ -2324,15 +2321,16 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, */ printk(KERN_ERR "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", - root->nodesize, BTRFS_STRIPE_LEN); + fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); return -EINVAL; } - if (root->sectorsize != PAGE_SIZE) { + if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ printk(KERN_ERR "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - root->sectorsize, (unsigned long long)PAGE_SIZE); + fs_info->chunk_root->sectorsize, + (unsigned long long)PAGE_SIZE); return -EINVAL; } @@ -2352,37 +2350,37 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return -EINVAL; } - ret = scrub_workers_get(root); + ret = scrub_workers_get(fs_info); if (ret) return ret; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || dev->missing) { - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -ENODEV; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); + return -EIO; } if (dev->scrub_device) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return -EINPROGRESS; } sctx = scrub_setup_ctx(dev); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + scrub_workers_put(fs_info); return PTR_ERR(sctx); } sctx->readonly = readonly; @@ -2390,7 +2388,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); down_read(&fs_info->scrub_super_lock); ret = scrub_supers(sctx, dev); @@ -2413,7 +2411,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, mutex_unlock(&fs_info->scrub_lock); scrub_free_ctx(sctx); - scrub_workers_put(root); + scrub_workers_put(fs_info); return ret; } @@ -2453,9 +2451,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root) up_write(&root->fs_info->scrub_super_lock); } -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { mutex_unlock(&fs_info->scrub_lock); @@ -2475,14 +2472,9 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_scrub_cancel(struct btrfs_root *root) +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) { - return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = root->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); @@ -2514,12 +2506,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) * does not go away in cancel_dev. FIXME: find a better solution */ mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -ENODEV; } - ret = btrfs_scrub_cancel_dev(root, dev); + ret = btrfs_scrub_cancel_dev(fs_info, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return ret; @@ -2532,7 +2524,7 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (dev) sctx = dev->scrub_device; if (sctx) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index acd2df85bed..a1a6c296ddc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - __btrfs_scrub_cancel(fs_info); + btrfs_scrub_cancel(fs_info); // WARN_ON(1); } } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cd8a32c448..d2c0bccca60 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1398,7 +1398,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root, devid, dev_uuid, + device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); if (!device) { ret = -ENOENT; @@ -1435,7 +1435,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) spin_unlock(&root->fs_info->free_chunk_lock); device->in_fs_metadata = 0; - btrfs_scrub_cancel_dev(root, device); + btrfs_scrub_cancel_dev(root->fs_info, device); /* * the device list mutex makes sure that we don't change @@ -1492,7 +1492,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (clear_super) { + if (clear_super && disk_super) { /* make sure this device isn't detected as part of * the FS anymore */ @@ -1540,7 +1540,7 @@ int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(root, devid, dev_uuid, + *device = btrfs_find_device(root->fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); if (!*device) @@ -1699,7 +1699,8 @@ next_slot: read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -4463,13 +4464,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, return 0; } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - cur_devices = root->fs_info->fs_devices; + cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -4567,8 +4568,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, - NULL); + map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, + uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { kfree(map); free_extent_map(em); @@ -4686,7 +4687,7 @@ static int read_one_dev(struct btrfs_root *root, return ret; } - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device || !device->bdev) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -5078,7 +5079,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, stats->devid, NULL, NULL); + dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7eaaf4e6195..802e2ba02f0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,7 +281,7 @@ void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -- cgit v1.2.3 From 1acd6831d98779c88cd57f0a5826d6df0b09f3fa Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:11:06 +0100 Subject: Btrfs: avoid risk of a deadlock in btrfs_handle_error Remove the attempt to cancel a running scrub or device replace operation in btrfs_handle_error() because it adds the risk of a deadlock. The only penalty of not canceling the operation is that some I/O remains active until the procedure completes. This is basically the same thing that happens to other tasks that are running in user mode context, they are not affected or stopped in btrfs_handle_error(), these tasks just need to handle write errors correctly. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1a6c296ddc..ef2415896b0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - btrfs_scrub_cancel(fs_info); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. It would add the risk of a deadlock, + * therefore the canceling is ommited. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time when the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ // WARN_ON(1); } } -- cgit v1.2.3 From e922e087a35c437acef3bc88ce31e59c699c38bd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:26:40 +0100 Subject: Btrfs: enhance btrfs structures for device replace support Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 39 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 5 +++++ 2 files changed, 44 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f8bb62c82b0..0781fd4a5c1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -885,6 +885,42 @@ struct btrfs_dev_stats_item { __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; } __attribute__ ((__packed__)); +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1471,6 +1507,9 @@ struct btrfs_fs_info { int backup_root_index; int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42a8024e935..9d1b7106081 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2131,6 +2131,11 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); spin_lock_init(&fs_info->qgroup_lock); fs_info->qgroup_tree = RB_ROOT; -- cgit v1.2.3 From a2bff64025d7a707ac49155bb6678a636e55096e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:32:20 +0100 Subject: Btrfs: introduce a btrfs_dev_replace_item type Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/print-tree.c | 3 +++ 2 files changed, 69 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0781fd4a5c1..147406d0f9a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -921,6 +921,23 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1762,6 +1779,12 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_DEV_STATS_KEY 249 +/* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -2795,6 +2818,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5e23684887e..50d95fd190a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; + case BTRFS_DEV_REPLACE_KEY: + printk(KERN_INFO "\t\tdev replace\n"); + break; }; } } -- cgit v1.2.3 From 5ac00addc7ac09110995fe967071d191b5981cc1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:54:08 +0100 Subject: Btrfs: disallow mutually exclusive admin operations from user mode Btrfs admin operations that are manually started from user mode and that cannot be executed at the same time return -EINPROGRESS. A common way to enter and leave this locked section is introduced since it used to be specific to the balance operation. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/ioctl.c | 53 ++++++++++++++++++++++++++++++++++++----------------- fs/btrfs/volumes.c | 2 ++ 3 files changed, 40 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 147406d0f9a..e9dc78014f0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1527,6 +1527,8 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; }; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b40b827f93e..26f46dad3b0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1317,13 +1317,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1419,6 +1419,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2160,9 +2161,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; + } ret = mnt_want_write_file(file); - if (ret) + if (ret) { + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); return ret; + } switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -2214,6 +2223,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) } out: mnt_drop_write_file(file); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2225,13 +2235,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2244,6 +2254,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -2258,13 +2269,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + return -EINPROGRESS; } + mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2277,6 +2288,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3319,6 +3331,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; int ret; + int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3354,10 +3367,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (fs_info->balance_ctl) { + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); ret = -EINPROGRESS; goto out_bargs; } + need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3391,6 +3407,9 @@ do_balance: out_bargs: kfree(bargs); out: + if (need_to_clear_lock) + atomic_set(&root->fs_info->mutually_exclusive_operation_running, + 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); mnt_drop_write_file(file); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2c0bccca60..33ca36b37a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2952,6 +2952,7 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -2974,6 +2975,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); -- cgit v1.2.3 From 63a212abc2315972b245f93cb11ae3acf3c0b513 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:29:28 +0100 Subject: Btrfs: disallow some operations on the device replace target device This patch adds some code to disallow operations on the device that is used as the target for the device replace operation. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/ioctl.c | 8 +++++++- fs/btrfs/scrub.c | 14 +++++++++----- fs/btrfs/super.c | 3 ++- fs/btrfs/volumes.c | 41 ++++++++++++++++++++++++++++++++--------- fs/btrfs/volumes.h | 1 + 7 files changed, 54 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e9dc78014f0..746cb6aa1f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3649,7 +3649,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly); + int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b4d438f6c2b..98af8379895 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7468,7 +7468,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 26f46dad3b0..e54b5e50c92 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1375,6 +1375,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } } + if (device->is_tgtdev_for_dev_replace) { + ret = -EINVAL; + goto out_free; + } + old_size = device->total_bytes; if (mod < 0) { @@ -3102,7 +3107,8 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) return PTR_ERR(sa); ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6cf23f4f7bb..460e30bb188 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -116,6 +116,9 @@ struct scrub_ctx { u32 sectorsize; u32 nodesize; u32 leafsize; + + int is_dev_replace; + /* * statistics */ @@ -284,7 +287,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) } static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { struct scrub_ctx *sctx; int i; @@ -296,6 +299,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + sctx->is_dev_replace = is_dev_replace; sctx->pages_per_bio = pages_per_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; @@ -2293,7 +2297,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, - int readonly) + int readonly, int is_dev_replace) { struct scrub_ctx *sctx; int ret; @@ -2356,14 +2360,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev || dev->missing) { + if (!dev || (dev->missing && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -ENODEV; } mutex_lock(&fs_info->scrub_lock); - if (!dev->in_fs_metadata) { + if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); @@ -2376,7 +2380,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); return -EINPROGRESS; } - sctx = scrub_setup_ctx(dev); + sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef2415896b0..837ad2d2785 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1354,7 +1354,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) min_stripe_size = BTRFS_STRIPE_LEN; list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev) + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) continue; avail_space = device->total_bytes - device->bytes_used; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 33ca36b37a6..31f7af878d9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -518,8 +518,9 @@ again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) { - if (!latest_transid || - device->generation > latest_transid) { + if (!device->is_tgtdev_for_dev_replace && + (!latest_transid || + device->generation > latest_transid)) { latest_devid = device->devid; latest_transid = device->generation; latest_bdev = device->bdev; @@ -814,7 +815,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, *length = 0; - if (start >= device->total_bytes) + if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) return 0; path = btrfs_alloc_path(); @@ -931,7 +932,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, max_hole_size = 0; hole_size = 0; - if (search_start >= search_end) { + if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto error; } @@ -1114,6 +1115,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; WARN_ON(!device->in_fs_metadata); + WARN_ON(device->is_tgtdev_for_dev_replace); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1375,7 +1377,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * is held. */ list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { + if (tmp->in_fs_metadata && + !tmp->is_tgtdev_for_dev_replace && + !tmp->bdev) { device = tmp; break; } @@ -1406,6 +1410,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } + if (device->is_tgtdev_for_dev_replace) { + pr_err("btrfs: unable to remove the dev_replace target dev\n"); + ret = -EINVAL; + goto error_brelse; + } + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { printk(KERN_ERR "btrfs: unable to remove the only writeable " "device\n"); @@ -1425,6 +1435,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + /* + * TODO: the superblock still includes this device in its num_devices + * counter although write_all_supers() is not locked out. This + * could give a filesystem state which requires a degraded mount. + */ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) goto error_undo; @@ -1808,6 +1823,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); @@ -1971,7 +1987,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; - if (new_size <= device->total_bytes) + if (new_size <= device->total_bytes || + device->is_tgtdev_for_dev_replace) return -EINVAL; btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -2600,7 +2617,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free) + device->total_bytes - device->bytes_used > size_to_free || + device->is_tgtdev_for_dev_replace) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -3132,6 +3150,9 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; + if (device->is_tgtdev_for_dev_replace) + return -EINVAL; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3401,7 +3422,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, continue; } - if (!device->in_fs_metadata) + if (!device->in_fs_metadata || + device->is_tgtdev_for_dev_replace) continue; if (device->total_bytes > device->bytes_used) @@ -4612,6 +4634,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -4722,7 +4745,7 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; spin_lock(&root->fs_info->free_chunk_lock); root->fs_info->free_chunk_space += device->total_bytes - diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 802e2ba02f0..8fd5a4d8acc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -50,6 +50,7 @@ struct btrfs_device { int in_fs_metadata; int missing; int can_discard; + int is_tgtdev_for_dev_replace; spinlock_t io_lock; -- cgit v1.2.3 From 618919236ba54361e93106f4951d233a7ade63cd Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:51:52 +0100 Subject: Btrfs: handle errors from btrfs_map_bio() everywhere With the addition of the device replace procedure, it is possible for btrfs_map_bio(READ) to report an error. This happens when the specific mirror is requested which is located on the target disk, and the copy operation has not yet copied this block. Hence the block cannot be read and this error state is indicated by returning EIO. Some background information follows now. A new mirror is added while the device replace procedure is running. btrfs_get_num_copies() returns one more, and btrfs_map_bio(GET_READ_MIRROR) adds one more mirror if a disk location is involved that was already handled by the device replace copy operation. The assigned mirror num is the highest mirror number, e.g. the value 3 in case of RAID1. If btrfs_map_bio() is invoked with mirror_num == 0 (i.e., select any mirror), the copy on the target drive is never selected because that disk shall be able to perform the write requests as quickly as possible. The parallel execution of read requests would only slow down the disk copy procedure. Second case is that btrfs_map_bio() is called with mirror_num > 0. This is done from the repair code only. In this case, the highest mirror num is assigned to the target disk, since it is used last. And when this mirror is not available because the copy procedure has not yet handled this area, an error is returned. Everywhere in the code the handling of such errors is added now. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 15 +++++++++++++-- fs/btrfs/compression.c | 6 ++++-- fs/btrfs/disk-io.c | 44 +++++++++++++++++++++++++++----------------- fs/btrfs/extent_io.c | 4 ---- fs/btrfs/inode.c | 27 ++++++++++++++++++++------- fs/btrfs/volumes.c | 2 +- 6 files changed, 65 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 8f9abedae2c..badc6f141b6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1585,6 +1585,18 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + device = multi->stripes[0].dev; block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; @@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - if (0 == ret) - kfree(multi); + kfree(multi); if (NULL == block_ctx_out->dev) { ret = -ENXIO; printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6467aa88be..94ab2f80e7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); @@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); return 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d1b7106081..0e410478ad2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -852,11 +852,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int ret; + /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } static int check_async_write(struct inode *inode, unsigned long bio_flags) @@ -878,7 +883,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret; if (!(rw & REQ_WRITE)) { - /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -886,26 +890,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); } - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - bio_offset, - __btree_submit_bio_start, - __btree_submit_bio_done); + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; } #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 62ec6e45f70..1b319df29ee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2462,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. - */ static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5d1675a8c9e..d7bf2e7ee8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1602,7 +1602,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; - return btrfs_map_bio(root, rw, bio, mirror_num, 1); + int ret; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; } /* @@ -1626,15 +1631,17 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) - return ret; + goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { - return btrfs_submit_compressed_read(inode, bio, - mirror_num, bio_flags); + ret = btrfs_submit_compressed_read(inode, bio, + mirror_num, + bio_flags); + goto out; } else if (!skip_sum) { ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); if (ret) - return ret; + goto out; } goto mapit; } else if (!skip_sum) { @@ -1642,15 +1649,21 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); + goto out; } mapit: - return btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: + if (ret < 0) + bio_endio(bio, ret); + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 31f7af878d9..415862885b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4435,7 +4435,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num); - if (ret) /* -ENOMEM */ + if (ret) return ret; total_devs = bbio->num_stripes; -- cgit v1.2.3 From ff023aac31198e88507d626825379b28ea481d4d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 11:43:11 +0100 Subject: Btrfs: add code to scrub to copy read data to another disk The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit adds code to scrub to allow the scrub code to copy read data to another disk. One goal is to be able to perform as fast as possible. Therefore the write requests are collected until huge bios are built, and the write process is decoupled from the read process with some kind of flow control, of course, in order to limit the allocated memory. The best performance on spinning disks could by reached when the head movements are avoided as much as possible. Therefore a single worker is used to interface the read process with the write process. The regular scrub operation works as fast as before, it is not negatively influenced and actually it is more or less unchanged. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 + fs/btrfs/dev-replace.h | 26 ++ fs/btrfs/reada.c | 10 +- fs/btrfs/scrub.c | 883 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/super.c | 3 +- 5 files changed, 851 insertions(+), 73 deletions(-) create mode 100644 fs/btrfs/dev-replace.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746cb6aa1f6..ded7caa0d30 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1483,6 +1483,8 @@ struct btrfs_fs_info { struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; struct btrfs_workers scrub_workers; + struct btrfs_workers scrub_wr_completion_workers; + struct btrfs_workers scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 00000000000..1fb5c89037e --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0ddc5659f94..9f363e17ec7 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } + if (!dev->bdev) { + /* cannot read ahead on missing device */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { dev = bbio->stripes[i].dev; BUG_ON(dev == NULL); + /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); } BUG_ON(fs_info == NULL); @@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - reada_add_block(rc, start, &max_key, level, generation); + if (reada_add_block(rc, start, &max_key, level, generation)) { + kfree(rc); + return ERR_PTR(-ENOMEM); + } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 460e30bb188..61157a26cf2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" @@ -44,8 +45,15 @@ struct scrub_block; struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ /* * the following value times PAGE_SIZE needs to be large enough to match the @@ -62,6 +70,7 @@ struct scrub_page { u64 generation; u64 logical; u64 physical; + u64 physical_for_dev_replace; atomic_t ref_count; struct { unsigned int mirror_num:8; @@ -79,7 +88,11 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; +#else + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif int page_count; int next_free; struct btrfs_work work; @@ -99,8 +112,16 @@ struct scrub_block { }; }; +struct scrub_wr_ctx { + struct scrub_bio *wr_curr_bio; + struct btrfs_device *tgtdev; + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ + atomic_t flush_all_writes; + struct mutex wr_lock; +}; + struct scrub_ctx { - struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; struct btrfs_root *dev_root; int first_free; int curr; @@ -112,12 +133,13 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + int pages_per_rd_bio; u32 sectorsize; u32 nodesize; u32 leafsize; int is_dev_replace; + struct scrub_wr_ctx wr_ctx; /* * statistics @@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum { int mirror_num; }; +struct scrub_copy_nocow_ctx { + struct scrub_ctx *sctx; + u64 logical; + u64 len; + int mirror_num; + u64 physical_for_dev_replace; + struct btrfs_work work; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, - struct scrub_block *sblock); + struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, @@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); @@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force); + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; + scrub_free_wr_ctx(&sctx->wr_ctx); + /* this can happen when scrub is cancelled */ if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); + WARN_ON(!sbio->pagev[i]->page); scrub_block_put(sbio->pagev[i]->sblock); } bio_put(sbio->bio); } - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio = sctx->bios[i]; if (!sbio) @@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; + int pages_per_rd_bio; + int ret; - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); + /* + * the setting of pages_per_rd_bio is correct for scrub but might + * be wrong for the dev_replace code where we might read from + * different devices in the initial huge bios. However, that + * code is able to correctly handle the case when adding a page + * to a bio fails. + */ + if (dev->bdev) + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, + bio_get_nr_vecs(dev->bdev)); + else + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = pages_per_bio; + sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; sctx->dev_root = dev->dev_root; - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->page_count = 0; sbio->work.func = scrub_bio_end_io_worker; - if (i != SCRUB_BIOS_PER_CTX - 1) + if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; else sctx->bios[i]->next_free = -1; @@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, + fs_info->dev_replace.tgtdev, is_dev_replace); + if (ret) { + scrub_free_ctx(sctx); + return ERR_PTR(ret); + } return sctx; nomem: @@ -341,7 +419,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, + void *warn_ctx) { u64 isize; u32 nlink; @@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) int i; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct scrub_warning *swarn = ctx; + struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; @@ -492,11 +571,11 @@ out: kfree(swarn.msg_buf); } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) { struct page *page = NULL; unsigned long index; - struct scrub_fixup_nodatasum *fixup = ctx; + struct scrub_fixup_nodatasum *fixup = fixup_ctx; int ret; int corrected = 0; struct btrfs_key key; @@ -660,7 +739,9 @@ out: spin_lock(&sctx->stat_lock); ++sctx->stat.uncorrectable_errors; spin_unlock(&sctx->stat_lock); - + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info->dev_replace. + num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", (unsigned long long)fixup->logical, @@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) csum = sblock_to_check->pagev[0]->csum; dev = sblock_to_check->pagev[0]->dev; + if (sctx->is_dev_replace && !is_metadata && !have_csum) { + sblocks_for_recheck = NULL; + goto nodatasum_case; + } + /* * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was @@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, length, + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, logical, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sctx->stat.unverified_errors++; spin_unlock(&sctx->stat_lock); + if (sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock_bad); goto out; } @@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly) + if (sctx->readonly && !sctx->is_dev_replace) goto did_not_correct_error; if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: + WARN_ON(sctx->is_dev_replace); + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); + if (sctx->is_dev_replace) { + scrub_write_block_to_dev_replace(sblock_other); + } else { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy( + sblock_bad, sblock_other, + force_write); + } if (0 == ret) goto corrected_error; } } /* - * in case of I/O errors in the area that is supposed to be + * for dev_replace, pick good pages and write to the target device. + */ + if (sctx->is_dev_replace) { + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { + int sub_success; + + sub_success = 0; + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = + sblocks_for_recheck + mirror_index; + struct scrub_page *page_other = + sblock_other->pagev[page_num]; + + if (!page_other->io_error) { + ret = scrub_write_page_to_dev_replace( + sblock_other, page_num); + if (ret == 0) { + /* succeeded for this page */ + sub_success = 1; + break; + } else { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + } + } + } + + if (!sub_success) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + success = 0; + ret = scrub_write_page_to_dev_replace( + sblock_bad, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sctx->dev_root->fs_info-> + dev_replace.num_write_errors); + } + } + + goto out; + } + + /* + * for regular scrub, repair those pages that are errored. + * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from * the area to fix. Afterwards verify the checksum of the block @@ -1017,6 +1169,7 @@ out: static int scrub_setup_recheck_block(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info, + struct scrub_block *original_sblock, u64 length, u64 logical, struct scrub_block *sblocks_for_recheck) { @@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, return -EIO; } - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; mirror_index++) { struct scrub_block *sblock; @@ -1071,6 +1224,10 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; + BUG_ON(page_index >= original_sblock->page_count); + page->physical_for_dev_replace = + original_sblock->pagev[page_index]-> + physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; @@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; DECLARE_COMPLETION_ONSTACK(complete); + if (!page_bad->dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (!bio_flagged(bio, BIO_UPTODATE)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_replace_stats_inc( + &sblock_bad->sctx->dev_root->fs_info-> + dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1278,7 +1444,168 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return 0; } -static void scrub_checksum(struct scrub_block *sblock) +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ + int page_num; + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + int ret; + + ret = scrub_write_page_to_dev_replace(sblock, page_num); + if (ret) + btrfs_dev_replace_stats_inc( + &sblock->sctx->dev_root->fs_info->dev_replace. + num_write_errors); + } +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, + int page_num) +{ + struct scrub_page *spage = sblock->pagev[page_num]; + + BUG_ON(spage->page == NULL); + if (spage->io_error) { + void *mapped_buffer = kmap_atomic(spage->page); + + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + flush_dcache_page(spage->page); + kunmap_atomic(mapped_buffer); + } + return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + int ret; + + mutex_lock(&wr_ctx->wr_lock); +again: + if (!wr_ctx->wr_curr_bio) { + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), + GFP_NOFS); + if (!wr_ctx->wr_curr_bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + wr_ctx->wr_curr_bio->sctx = sctx; + wr_ctx->wr_curr_bio->page_count = 0; + } + sbio = wr_ctx->wr_curr_bio; + if (sbio->page_count == 0) { + struct bio *bio; + + sbio->physical = spage->physical_for_dev_replace; + sbio->logical = spage->logical; + sbio->dev = wr_ctx->tgtdev; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + if (!bio) { + mutex_unlock(&wr_ctx->wr_lock); + return -ENOMEM; + } + sbio->bio = bio; + } + + bio->bi_private = sbio; + bio->bi_end_io = scrub_wr_bio_end_io; + bio->bi_bdev = sbio->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical_for_dev_replace || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { + scrub_wr_submit(sctx); + goto again; + } + + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); + return -EIO; + } + scrub_wr_submit(sctx); + goto again; + } + + sbio->pagev[sbio->page_count] = spage; + scrub_page_get(spage); + sbio->page_count++; + if (sbio->page_count == wr_ctx->pages_per_wr_bio) + scrub_wr_submit(sctx); + mutex_unlock(&wr_ctx->wr_lock); + + return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; + struct scrub_bio *sbio; + + if (!wr_ctx->wr_curr_bio) + return; + + sbio = wr_ctx->wr_curr_bio; + wr_ctx->wr_curr_bio = NULL; + WARN_ON(!sbio->bio->bi_bdev); + scrub_pending_bio_inc(sctx); + /* process all writes in a single worker thread. Then the block layer + * orders the requests before sending them to the driver which + * doubled the write performance on spinning disks when measured + * with Linux 3.5 */ + btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + sbio->work.func = scrub_wr_bio_end_io_worker; + btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_ctx *sctx = sbio->sctx; + int i; + + WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); + if (sbio->err) { + struct btrfs_dev_replace *dev_replace = + &sbio->sctx->dev_root->fs_info->dev_replace; + + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + btrfs_dev_replace_stats_inc(&dev_replace-> + num_write_errors); + } + } + + for (i = 0; i < sbio->page_count; i++) + scrub_page_put(sbio->pagev[i]); + + bio_put(sbio->bio); + kfree(sbio); + scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock) { u64 flags; int ret; @@ -1296,6 +1623,8 @@ static void scrub_checksum(struct scrub_block *sblock) WARN_ON(1); if (ret) scrub_handle_errored_block(sblock); + + return ret; } static int scrub_checksum_data(struct scrub_block *sblock) @@ -1386,7 +1715,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -1534,11 +1863,24 @@ static void scrub_submit(struct scrub_ctx *sctx) sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + if (!sbio->bio->bi_bdev) { + /* + * this case should not happen. If btrfs_map_block() is + * wrong, it could happen for dev-replace operations on + * missing devices when no mirrors are available, but in + * this case it should already fail the mount. + * This case is handled correctly (but _very_ slowly). + */ + printk_ratelimited(KERN_WARNING + "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); + bio_endio(sbio->bio, -EIO); + } else { + btrfsic_submit_bio(READ, sbio->bio); + } } -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_page *spage) { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; @@ -1570,7 +1912,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio); + bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -1602,10 +1944,10 @@ again: goto again; } - scrub_block_get(sblock); /* one for the added page */ + scrub_block_get(sblock); /* one for the page added to the bio */ atomic_inc(&sblock->outstanding_pages); sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + if (sbio->page_count == sctx->pages_per_rd_bio) scrub_submit(sctx); return 0; @@ -1613,7 +1955,8 @@ again: static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force) + u64 gen, int mirror_num, u8 *csum, int force, + u64 physical_for_dev_replace) { struct scrub_block *sblock; int index; @@ -1654,6 +1997,7 @@ leave_nomem: spage->generation = gen; spage->logical = logical; spage->physical = physical; + spage->physical_for_dev_replace = physical_for_dev_replace; spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; @@ -1668,6 +2012,7 @@ leave_nomem: len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } WARN_ON(sblock->page_count == 0); @@ -1675,7 +2020,7 @@ leave_nomem: struct scrub_page *spage = sblock->pagev[index]; int ret; - ret = scrub_add_page_to_bio(sctx, spage); + ret = scrub_add_page_to_rd_bio(sctx, spage); if (ret) { scrub_block_put(sblock); return ret; @@ -1707,7 +2052,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) struct scrub_ctx *sctx = sbio->sctx; int i; - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); if (sbio->err) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -1733,15 +2078,30 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sbio->next_free = sctx->first_free; sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + scrub_pending_bio_dec(sctx); } static void scrub_block_complete(struct scrub_block *sblock) { - if (!sblock->no_io_error_seen) + if (!sblock->no_io_error_seen) { scrub_handle_errored_block(sblock); - else - scrub_checksum(sblock); + } else { + /* + * if has checksum error, write via repair mechanism in + * dev replace case, otherwise write here in dev replace + * case. + */ + if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + scrub_write_block_to_dev_replace(sblock); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -1786,7 +2146,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num) + u64 gen, int mirror_num, u64 physical_for_dev_replace) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -1799,7 +2159,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sctx->nodesize != sctx->leafsize); + WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -1807,7 +2167,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, spin_unlock(&sctx->stat_lock); } else { blocksize = sctx->sectorsize; - BUG_ON(1); + WARN_ON(1); } while (len) { @@ -1819,14 +2179,23 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, have_csum = scrub_find_csum(sctx, logical, l, csum); if (have_csum == 0) ++sctx->stat.no_csum; + if (sctx->is_dev_replace && !have_csum) { + ret = copy_nocow_pages(sctx, logical, l, + mirror_num, + physical_for_dev_replace); + goto behind_scrub_pages; + } } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + mirror_num, have_csum ? csum : NULL, 0, + physical_for_dev_replace); +behind_scrub_pages: if (ret) return ret; len -= l; logical += l; physical += l; + physical_for_dev_replace += l; } return 0; } @@ -1834,7 +2203,8 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length) + int num, u64 base, u64 length, + int is_dev_replace) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; @@ -1859,6 +2229,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + int extent_mirror_num; nstripes = length; offset = 0; @@ -1966,9 +2341,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); mutex_lock(&fs_info->scrub_lock); @@ -2063,10 +2443,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key.objectid; } - ret = scrub_extent(sctx, key.objectid, key.offset, - key.objectid - logical + physical, - scrub_dev, flags, generation, - mirror_num); + extent_logical = key.objectid; + extent_physical = key.objectid - logical + physical; + extent_len = key.offset; + extent_dev = scrub_dev; + extent_mirror_num = mirror_num; + if (is_dev_replace) + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + ret = scrub_extent(sctx, extent_logical, extent_len, + extent_physical, extent_dev, flags, + generation, extent_mirror_num, + key.objectid - logical + physical); if (ret) goto out; @@ -2080,10 +2470,13 @@ next: sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); } +out: /* push queued extents */ scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); -out: blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; @@ -2093,14 +2486,14 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) + u64 dev_offset, int is_dev_replace) { struct btrfs_mapping_tree *map_tree = &sctx->dev_root->fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; - int ret = -EINVAL; + int ret = 0; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); @@ -2120,7 +2513,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length); + chunk_offset, length, + is_dev_replace); if (ret) goto out; } @@ -2133,7 +2527,8 @@ out: static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end) + struct btrfs_device *scrub_dev, u64 start, u64 end, + int is_dev_replace) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -2149,6 +2544,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_block_group_cache *cache; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); if (!path) @@ -2214,11 +2610,61 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOENT; break; } + dev_replace->cursor_right = found_key.offset + length; + dev_replace->cursor_left = found_key.offset; + dev_replace->item_needs_writeback = 1; ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset); + chunk_offset, length, found_key.offset, + is_dev_replace); + + /* + * flush, submit all pending read and write bios, afterwards + * wait for them. + * Note that in the dev replace case, a read request causes + * write requests that are submitted in the read completion + * worker. Therefore in the current situation, it is required + * that all write requests are flushed, so that all read and + * write requests are really completed when bios_in_flight + * changes to 0. + */ + atomic_set(&sctx->wr_ctx.flush_all_writes, 1); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + wait_event(sctx->list_wait, + atomic_read(&sctx->workers_pending) == 0); + + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); + } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); + + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; btrfs_put_block_group(cache); if (ret) break; + if (atomic64_read(&dev_replace->num_write_errors) > 0) { + ret = -EIO; + break; + } + if (sctx->stat.malloc_errors > 0) { + ret = -ENOMEM; + break; + } key.offset = found_key.offset + length; btrfs_release_path(path); @@ -2254,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, 1); + NULL, 1, bytenr); if (ret) return ret; } @@ -2266,18 +2712,38 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + int is_dev_replace) { int ret = 0; mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); + if (is_dev_replace) + btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, + &fs_info->generic_worker); + else + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, + &fs_info->generic_worker); fs_info->scrub_workers.idle_thresh = 4; ret = btrfs_start_workers(&fs_info->scrub_workers); if (ret) goto out; + btrfs_init_workers(&fs_info->scrub_wr_completion_workers, + "scrubwrc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + fs_info->scrub_wr_completion_workers.idle_thresh = 2; + ret = btrfs_start_workers( + &fs_info->scrub_wr_completion_workers); + if (ret) + goto out; + btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, + &fs_info->generic_worker); + ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); + if (ret) + goto out; } ++fs_info->scrub_workers_refcnt; out: @@ -2289,8 +2755,11 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { mutex_lock(&fs_info->scrub_lock); - if (--fs_info->scrub_workers_refcnt == 0) + if (--fs_info->scrub_workers_refcnt == 0) { btrfs_stop_workers(&fs_info->scrub_workers); + btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); + btrfs_stop_workers(&fs_info->scrub_nocow_workers); + } WARN_ON(fs_info->scrub_workers_refcnt < 0); mutex_unlock(&fs_info->scrub_lock); } @@ -2354,7 +2823,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - ret = scrub_workers_get(fs_info); + ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) return ret; @@ -2394,12 +2863,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sctx, dev); - up_read(&fs_info->scrub_super_lock); + if (!is_dev_replace) { + down_read(&fs_info->scrub_super_lock); + ret = scrub_supers(sctx, dev); + up_read(&fs_info->scrub_super_lock); + } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end); + ret = scrub_enumerate_chunks(sctx, dev, start, end, + is_dev_replace); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -2537,3 +3009,272 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, + u64 extent_logical, u64 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) +{ + u64 mapped_length; + struct btrfs_bio *bbio = NULL; + int ret; + + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (ret || !bbio || mapped_length < extent_len || + !bbio->stripes[0].dev->bdev) { + kfree(bbio); + return; + } + + *extent_physical = bbio->stripes[0].physical; + *extent_mirror_num = bbio->mirror_num; + *extent_dev = bbio->stripes[0].dev; + kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, + struct scrub_wr_ctx *wr_ctx, + struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, + int is_dev_replace) +{ + WARN_ON(wr_ctx->wr_curr_bio != NULL); + + mutex_init(&wr_ctx->wr_lock); + wr_ctx->wr_curr_bio = NULL; + if (!is_dev_replace) + return 0; + + WARN_ON(!dev->bdev); + wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, + bio_get_nr_vecs(dev->bdev)); + wr_ctx->tgtdev = dev; + atomic_set(&wr_ctx->flush_all_writes, 0); + return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ + mutex_lock(&wr_ctx->wr_lock); + kfree(wr_ctx->wr_curr_bio); + wr_ctx->wr_curr_bio = NULL; + mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, + int mirror_num, u64 physical_for_dev_replace) +{ + struct scrub_copy_nocow_ctx *nocow_ctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + + nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); + if (!nocow_ctx) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + scrub_pending_trans_workers_inc(sctx); + + nocow_ctx->sctx = sctx; + nocow_ctx->logical = logical; + nocow_ctx->len = len; + nocow_ctx->mirror_num = mirror_num; + nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; + nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_queue_worker(&fs_info->scrub_nocow_workers, + &nocow_ctx->work); + + return 0; +} + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = + container_of(work, struct scrub_copy_nocow_ctx, work); + struct scrub_ctx *sctx = nocow_ctx->sctx; + u64 logical = nocow_ctx->logical; + u64 len = nocow_ctx->len; + int mirror_num = nocow_ctx->mirror_num; + u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + int ret; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + struct btrfs_root *root; + int not_written = 0; + + fs_info = sctx->dev_root->fs_info; + root = fs_info->extent_root; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + not_written = 1; + goto out; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + not_written = 1; + goto out; + } + + ret = iterate_inodes_from_logical(logical, fs_info, path, + copy_nocow_pages_for_inode, + nocow_ctx); + if (ret != 0 && ret != -ENOENT) { + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", + (unsigned long long)logical, + (unsigned long long)physical_for_dev_replace, + (unsigned long long)len, + (unsigned long long)mirror_num, ret); + not_written = 1; + goto out; + } + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, root); + if (not_written) + btrfs_dev_replace_stats_inc(&fs_info->dev_replace. + num_uncorrectable_read_errors); + + btrfs_free_path(path); + kfree(nocow_ctx); + + scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + unsigned long index; + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + int ret = 0; + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_root *local_root; + u64 physical_for_dev_replace; + u64 len; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; + len = nocow_ctx->len; + while (len >= PAGE_CACHE_SIZE) { + struct page *page = NULL; + int ret_sub; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + pr_err("find_or_create_page() failed\n"); + ret = -ENOMEM; + goto next_page; + } + + if (PageUptodate(page)) { + if (PageDirty(page)) + goto next_page; + } else { + ClearPageError(page); + ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + io_tree, + page, btrfs_get_extent, + nocow_ctx->mirror_num); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + wait_on_page_locked(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto next_page; + } + } + ret_sub = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (ret_sub) { + ret = ret_sub; + goto next_page; + } + +next_page: + if (page) { + unlock_page(page); + put_page(page); + } + offset += PAGE_CACHE_SIZE; + physical_for_dev_replace += PAGE_CACHE_SIZE; + len -= PAGE_CACHE_SIZE; + } + + if (inode) + iput(inode); + return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, + u64 physical_for_dev_replace, struct page *page) +{ + struct bio *bio; + struct btrfs_device *dev; + int ret; + DECLARE_COMPLETION_ONSTACK(compl); + + dev = sctx->wr_ctx.tgtdev; + if (!dev) + return -EIO; + if (!dev->bdev) { + printk_ratelimited(KERN_WARNING + "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + bio->bi_private = &compl; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_size = 0; + bio->bi_sector = physical_for_dev_replace >> 9; + bio->bi_bdev = dev->bdev; + ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: + bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + return -EIO; + } + btrfsic_submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + goto leave_with_eio; + + bio_put(bio); + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 837ad2d2785..ad4380684b9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, + new_pool_size); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) -- cgit v1.2.3 From e93c89c1aaaaaec3487c4c18dd02360371790722 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:33:06 +0100 Subject: Btrfs: add new sources for device replace code This adds a new file to the sources together with the header file and the changes to ioctl.h and ctree.h that are required by the new C source file. Additionally, 4 new functions are added to volume.c that deal with device creation and destruction. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 2 + fs/btrfs/dev-replace.c | 856 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/dev-replace.h | 18 ++ fs/btrfs/ioctl.h | 45 +++ fs/btrfs/volumes.c | 139 ++++++++ fs/btrfs/volumes.h | 8 + 7 files changed, 1069 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/dev-replace.c (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d7fcdba141a..7df3e0f0ee5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ded7caa0d30..45e7f752b64 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -142,6 +142,8 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0 + /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 00000000000..66dbc8dbddf --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", + (unsigned long long)src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for dev_replace item!\n", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. + */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + pr_warn("btrfs: insert dev_replace item failed %d!\n", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ + struct timespec t = CURRENT_TIME_SEC; + + return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + &tgt_device); + if (ret) { + pr_err("btrfs: target device %s is invalid!\n", + args->start.tgtdev_name); + mutex_unlock(&fs_info->volume_mutex); + return -EINVAL; + } + + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) { + ret = -EINVAL; + goto leave_no_lock; + } + + if (tgt_device->total_bytes < src_device->total_bytes) { + pr_err("btrfs: target device is smaller than source device!\n"); + ret = -EINVAL; + goto leave_no_lock; + } + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_extents(root, 0); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + src_device->total_bytes, + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + WARN_ON(ret); + + return 0; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? */ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + + if (scrub_ret) { + printk_in_rcu(KERN_ERR + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; + } + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + src_device->missing ? "" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + tgt_device->bytes_used = src_device->bytes_used; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + if (src_device->bdev) { + /* zero out the old super */ + btrfs_scratch_superblock(src_device); + } + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. + */ + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, + div64_u64(dev_replace->srcdev->total_bytes, 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + pr_info("btrfs: suspending dev_replace for unmount\n"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + do_div(progress, 10); + printk_in_rcu(KERN_INFO + "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? "" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + dev_replace->srcdev->total_bytes, + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 1fb5c89037e..20035cbbf02 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -19,6 +19,24 @@ #if !defined(__BTRFS_DEV_REPLACE__) #define __BTRFS_DEV_REPLACE__ +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { atomic64_inc(stat_value); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 731e2875ab9..62006ba0271 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -123,6 +123,48 @@ struct btrfs_ioctl_scrub_args { __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; }; +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +struct btrfs_ioctl_dev_replace_start_params { + __u64 srcdevid; /* in, if 0, use srcdev_name instead */ + __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ + __u64 cont_reading_from_srcdev_mode; /* in, see #define + * above */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 +struct btrfs_ioctl_dev_replace_status_params { + __u64 replace_state; /* out, see #define above */ + __u64 progress_1000; /* out, 0 <= x <= 1000 */ + __u64 time_started; /* out, seconds since 1-Jan-1970 */ + __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ + __u64 num_write_errors; /* out */ + __u64 num_uncorrectable_read_errors; /* out */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +struct btrfs_ioctl_dev_replace_args { + __u64 cmd; /* in */ + __u64 result; /* out */ + + union { + struct btrfs_ioctl_dev_replace_start_params start; + struct btrfs_ioctl_dev_replace_status_params status; + }; /* in/out */ + + __u64 spare[64]; +}; + #define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ @@ -453,4 +495,7 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_qgroup_limit_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_dev_replace_args) + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 415862885b6..5777e6a9aab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1537,6 +1537,53 @@ error_undo: goto error_brelse; } +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + list_del_rcu(&srcdev->dev_list); + list_del_rcu(&srcdev->dev_alloc_list); + fs_info->fs_devices->num_devices--; + if (srcdev->missing) { + fs_info->fs_devices->missing_devices--; + fs_info->fs_devices->rw_devices++; + } + if (srcdev->can_discard) + fs_info->fs_devices->num_can_discard--; + if (srcdev->bdev) + fs_info->fs_devices->open_devices--; + + call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + struct btrfs_device *next_device; + + WARN_ON(!tgtdev); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + if (tgtdev->bdev) { + btrfs_scratch_superblock(tgtdev); + fs_info->fs_devices->open_devices--; + } + fs_info->fs_devices->num_devices--; + if (tgtdev->can_discard) + fs_info->fs_devices->num_can_discard++; + + next_device = list_entry(fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (tgtdev->bdev == fs_info->sb->s_bdev) + fs_info->sb->s_bdev = next_device->bdev; + if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; + list_del_rcu(&tgtdev->dev_list); + + call_rcu(&tgtdev->rcu, free_device); + + mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device) { @@ -1931,6 +1978,98 @@ error: return ret; } +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out) +{ + struct request_queue *q; + struct btrfs_device *device; + struct block_device *bdev; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *devices; + struct rcu_string *name; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + ret = -ENOMEM; + goto error; + } + + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + device->devid = BTRFS_DEV_REPLACE_DEVID; + spin_lock_init(&device->io_lock); + device->generation = 0; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; + device->dev_root = fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->is_tgtdev_for_dev_replace = 1; + device->mode = FMODE_EXCL; + set_blocksize(device->bdev, 4096); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + if (device->can_discard) + fs_info->fs_devices->num_can_discard++; + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return ret; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev) +{ + WARN_ON(fs_info->fs_devices->rw_devices == 0); + tgtdev->io_width = fs_info->dev_root->sectorsize; + tgtdev->io_align = fs_info->dev_root->sectorsize; + tgtdev->sector_size = fs_info->dev_root->sectorsize; + tgtdev->dev_root = fs_info->dev_root; + tgtdev->in_fs_metadata = 1; +} + static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8fd5a4d8acc..58d79375dea 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -286,6 +286,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); @@ -302,6 +304,12 @@ int btrfs_get_dev_stats(struct btrfs_root *root, int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, + struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, -- cgit v1.2.3 From 8dabb7420f014ab0f9f04afae8ae046c0f48b270 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 13:15:27 +0100 Subject: Btrfs: change core code of btrfs to support the device replace operations This commit contains all the essential changes to the core code of Btrfs for support of the device replace procedure. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 24 +++++++++++++++++++++- fs/btrfs/reada.c | 17 ++++++++++++++++ fs/btrfs/scrub.c | 7 ++++++- fs/btrfs/super.c | 13 ++++++++++++ fs/btrfs/transaction.c | 7 ++++++- fs/btrfs/volumes.c | 54 ++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.h | 3 ++- 7 files changed, 111 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e410478ad2..76b82506bf9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" +#include "dev-replace.h" #ifdef CONFIG_X86 #include @@ -2438,7 +2439,11 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2510,6 +2515,14 @@ retry_root_backup: goto fail_block_groups; } + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("btrfs: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2658,6 +2671,13 @@ retry_root_backup: return ret; } + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + return 0; fail_qgroup: @@ -3300,6 +3320,8 @@ int close_ctree(struct btrfs_root *root) /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9f363e17ec7..c705a48e676 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -27,6 +27,7 @@ #include "volumes.h" #include "disk-io.h" #include "transaction.h" +#include "dev-replace.h" #undef DEBUG @@ -331,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -392,6 +394,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, } /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -399,13 +402,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; if (dev == prev_dev) { @@ -422,6 +429,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* cannot read ahead on missing device */ continue; } + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { @@ -434,10 +449,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } } spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); kfree(bbio); return re; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61157a26cf2..30cbf6921c0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2843,12 +2843,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - if (dev->scrub_device) { + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -EINPROGRESS; } + btrfs_dev_replace_unlock(&fs_info->dev_replace); sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ad4380684b9..def4f24b58d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "export.h" #include "compression.h" #include "rcu-string.h" +#include "dev-replace.h" #define CREATE_TRACE_POINTS #include @@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ sb->s_flags |= MS_RDONLY; + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + ret = btrfs_commit_super(root); if (ret) goto restore; @@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + goto restore; + } sb->s_flags &= ~MS_RDONLY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b297354e73..bcc6b65be3b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "tree-log.h" #include "inode-map.h" #include "volumes.h" +#include "dev-replace.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -845,7 +846,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - BUG_ON(ret); + WARN_ON(ret); + ret = btrfs_run_dev_replace(trans, root->fs_info); + WARN_ON(ret); ret = btrfs_run_qgroups(trans, root->fs_info); BUG_ON(ret); @@ -868,6 +871,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, switch_commit_root(fs_info->extent_root); up_write(&fs_info->extent_commit_sem); + btrfs_after_dev_replace_commit(fs_info); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5777e6a9aab..a4e0963bf45 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "math.h" +#include "dev-replace.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -505,7 +506,8 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; @@ -528,6 +530,21 @@ again: continue; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. + */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -536,7 +553,8 @@ again: if (device->writeable) { list_del_init(&device->dev_alloc_list); device->writeable = 0; - fs_devices->rw_devices--; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -594,7 +612,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -718,7 +736,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -1350,16 +1368,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_system_alloc_bits | root->fs_info->avail_metadata_alloc_bits; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -2935,6 +2959,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int mixed = 0; int ret; + u64 num_devices; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -2963,10 +2988,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) + if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) + else if (num_devices < 4) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -3591,6 +3623,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; + WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -4773,6 +4806,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 58d79375dea..37d0157167b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,7 +268,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -- cgit v1.2.3 From 29a8d9a0bce6a5abac1f313400c2e189e8d10e67 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:16:24 +0100 Subject: Btrfs: introduce GET_READ_MIRRORS functionality for btrfs_map_block() Before this commit, btrfs_map_block() was called with REQ_WRITE in order to retrieve the list of mirrors for a disk block. This needs to be changed for the device replace procedure since it makes a difference whether you are asking for read mirrors or for locations to write to. GET_READ_MIRRORS is introduced as a new interface to call btrfs_map_block(). In the current commit, the functionality is not yet changed, only the interface for GET_READ_MIRRORS is introduced and all the places that should use this new interface are adapted. The reason that REQ_WRITE cannot be abused anymore to retrieve a list of read mirrors is that during a running dev replace operation all write requests to the live filesystem are duplicated to also write to the target drive. Keep in mind that the target disk is only partially a valid copy of the source disk while the operation is ongoing. All writes go to the target disk, but not all reads would return valid data on the target disk. Therefore it is not possible anymore to abuse a REQ_WRITE interface to find valid mirrors for a REQ_READ. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/reada.c | 3 ++- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45e7f752b64..46bd7d5f504 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 #define BTRFS_FT_DIR 2 diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index c705a48e676..96b93daa0bb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -359,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(fs_info, REQ_WRITE, logical, &length, &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30cbf6921c0..30ba9972489 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1193,8 +1193,8 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length, - &bbio, 0); + ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < sublen) { kfree(bbio); return -EIO; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4e0963bf45..de0c05cca39 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4103,7 +4103,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr_end - stripe_nr_orig); stripe_index = do_div(stripe_nr, map->num_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -4115,7 +4115,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) { + if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -4129,7 +4129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (rw & REQ_WRITE) + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) num_stripes = map->sub_stripes; else if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->sub_stripes * @@ -4242,7 +4242,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & REQ_WRITE) { + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { -- cgit v1.2.3 From 472262f35a6b3407e761b700d74c53530e5f144d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:43:46 +0100 Subject: Btrfs: changes to live filesystem are also written to replacement disk During a running dev replace operation, all write requests to the live filesystem are duplicated to also write to the target drive. Therefore btrfs_map_block() is changed to duplicate stripes that are written to the source disk of a device replace procedure to be written to the target disk as well. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index de0c05cca39..4d3bf187ab5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4044,6 +4044,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + int num_alloc_stripes; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4089,6 +4092,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!bbio_ret) goto out; + btrfs_dev_replace_lock(dev_replace); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (!dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4155,7 +4163,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } BUG_ON(stripe_index >= map->num_stripes); - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + num_alloc_stripes = num_stripes; + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) + num_alloc_stripes <<= 1; + bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; goto out; @@ -4250,11 +4261,48 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } + if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + dev_replace->tgtdev != NULL) { + int index_where_to_add; + u64 srcdev_devid = dev_replace->srcdev->devid; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk + * to the new disk takes place at run time while the + * filesystem is mounted writable, the regular write + * operations to the old disk have to be duplicated to go + * to the new disk as well. + * Note that device->missing is handled by the caller, and + * that the write to the old disk is already set up in the + * stripes array. + */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + index_where_to_add++; + max_errors++; + } + } + num_stripes = index_where_to_add; + } + *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; out: + if (dev_replace_is_ongoing) + btrfs_dev_replace_unlock(dev_replace); free_extent_map(em); return ret; } -- cgit v1.2.3 From 30d9861ff9520e2a112eae71029bc9f7e915a441 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:52:18 +0100 Subject: Btrfs: optionally avoid reads from device replace source drive It is desirable to be able to configure the device replace procedure to avoid reading the source drive (the one to be copied) whenever possible. This is useful when the number of read errors on this disk is high, because it would delay the copy procedure alot. Therefore there is an option to avoid reading from the source disk unless the repair procedure really needs to access it. The regular read req asks for mapping the block with mirror_num == 0, in this case the source disk is avoided whenever possible. The repair code selects the mirror_num explicitly (mirror_num != 0), this case is not changed by this commit. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4d3bf187ab5..e2e01a32710 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4007,16 +4007,37 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } -static int find_live_mirror(struct map_lookup *map, int first, int num, - int optimal) +static int find_live_mirror(struct btrfs_fs_info *fs_info, + struct map_lookup *map, int first, int num, + int optimal, int dev_replace_is_ongoing) { int i; - if (map->stripes[optimal].dev->bdev) - return optimal; - for (i = first; i < first + num; i++) { - if (map->stripes[i].dev->bdev) - return i; + int tolerance; + struct btrfs_device *srcdev; + + if (dev_replace_is_ongoing && + fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) + srcdev = fs_info->dev_replace.srcdev; + else + srcdev = NULL; + + /* + * try to avoid the drive that is the source drive for a + * dev-replace procedure, only choose it if no other non-missing + * mirror is available + */ + for (tolerance = 0; tolerance < 2; tolerance++) { + if (map->stripes[optimal].dev->bdev && + (tolerance || map->stripes[optimal].dev != srcdev)) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev && + (tolerance || map->stripes[i].dev != srcdev)) + return i; + } } + /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ @@ -4116,9 +4137,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, else if (mirror_num) stripe_index = mirror_num - 1; else { - stripe_index = find_live_mirror(map, 0, + stripe_index = find_live_mirror(fs_info, map, 0, map->num_stripes, - current->pid % map->num_stripes); + current->pid % map->num_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index + 1; } @@ -4147,9 +4169,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index += mirror_num - 1; else { int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(map, stripe_index, + stripe_index = find_live_mirror(fs_info, map, + stripe_index, map->sub_stripes, stripe_index + - current->pid % map->sub_stripes); + current->pid % map->sub_stripes, + dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } } else { -- cgit v1.2.3 From 72d7aefccd512b66cd5543e652eae04be12085fc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 14:57:46 +0100 Subject: Btrfs: increase BTRFS_MAX_MIRRORS by one for dev replace This change of the define is effective in all modes, it is required and used only in the case when a device replace procedure is running. The reason is that during an active device replace procedure, the target device of the copy operation is a mirror for the filesystem data as well that can be used to read data in order to repair read errors on other disks. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 46bd7d5f504..91ff078e85d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,7 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 +#define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 -- cgit v1.2.3 From ad6d620e2a5704f6bf3a39c92a75aad962c51cb3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:06:47 +0100 Subject: Btrfs: allow repair code to include target disk when searching mirrors Make the target disk of a running device replace operation available for reading. This is only used as a last ressort for the defect repair procedure. And it is dependent on the location of the data block to read, because during an ongoing device replace operation, the target drive is only partially filled with the filesystem data. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2e01a32710..32a4948b621 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4004,6 +4004,12 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) else ret = 1; free_extent_map(em); + + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + ret++; + btrfs_dev_replace_unlock(&fs_info->dev_replace); + return ret; } @@ -4068,6 +4074,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; + int patch_the_first_stripe_for_dev_replace = 0; + u64 physical_to_patch_in_first_stripe = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4084,9 +4092,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -4118,6 +4123,88 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); + if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && + !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && + dev_replace->tgtdev != NULL) { + /* + * in dev-replace case, for repair case (that's the only + * case where the mirror is selected explicitly when + * calling btrfs_map_block), blocks left of the left cursor + * can also be read from the target drive. + * For REQ_GET_READ_MIRRORS, the target drive is added as + * the last one to the array of stripes. For READ, it also + * needs to be supported using the same mirror number. + * If the requested block is not left of the left cursor, + * EIO is returned. This can happen because btrfs_num_copies() + * returns one more in the dev-replace case. + */ + u64 tmp_length = *length; + struct btrfs_bio *tmp_bbio = NULL; + int tmp_num_stripes; + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + logical, &tmp_length, &tmp_bbio, 0); + if (ret) { + WARN_ON(tmp_bbio != NULL); + goto out; + } + + tmp_num_stripes = tmp_bbio->num_stripes; + if (mirror_num > tmp_num_stripes) { + /* + * REQ_GET_READ_MIRRORS does not contain this + * mirror, that means that the requested area + * is not left of the left cursor + */ + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + /* + * process the rest of the function using the mirror_num + * of the source drive. Therefore look it up first. + * At the end, patch the device pointer to the one of the + * target drive. + */ + for (i = 0; i < tmp_num_stripes; i++) { + if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + tmp_bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = + tmp_bbio->stripes[i].physical; + } + } + + if (found) { + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; + } else { + WARN_ON(1); + ret = -EIO; + kfree(tmp_bbio); + goto out; + } + + kfree(tmp_bbio); + } else if (mirror_num > map->num_stripes) { + mirror_num = 0; + } + num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; @@ -4188,8 +4275,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, BUG_ON(stripe_index >= map->num_stripes); num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD))) - num_alloc_stripes <<= 1; + if (dev_replace_is_ongoing) { + if (rw & (REQ_WRITE | REQ_DISCARD)) + num_alloc_stripes <<= 1; + if (rw & REQ_GET_READ_MIRRORS) + num_alloc_stripes++; + } bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { ret = -ENOMEM; @@ -4318,12 +4409,70 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; + } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + dev_replace->tgtdev != NULL) { + u64 srcdev_devid = dev_replace->srcdev->devid; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can + * also be used to read data in case it is needed to repair + * a corrupt block elsewhere. This is possible if the + * requested area is left of the left cursor. In this area, + * the target drive is a full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it + * simple, only add the mirror with the + * lowest physical address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + u64 length = map->stripe_len; + + if (physical_of_found + length <= + dev_replace->cursor_left) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + + num_stripes++; + } + } } *bbio_ret = bbio; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; + + /* + * this is the case that REQ_READ && dev_replace_is_ongoing && + * mirror_num == num_stripes + 1 && dev_replace target drive is + * available as a mirror + */ + if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { + WARN_ON(num_stripes > 1); + bbio->stripes[0].dev = dev_replace->tgtdev; + bbio->stripes[0].physical = physical_to_patch_in_first_stripe; + bbio->mirror_num = map->num_stripes + 1; + } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); -- cgit v1.2.3 From 3f6bcfbd4149875662773eb40a62294cddf215d4 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 15:08:53 +0100 Subject: Btrfs: add support for device replace ioctls This is the commit that allows to start the device replace procedure. An ioctl() interface is added that supports starting and canceling the device replace procedure, and to retrieve the status and progress. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 7 ++++--- 2 files changed, 52 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e54b5e50c92..9a71fec8615 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "backref.h" #include "rcu-string.h" #include "send.h" +#include "dev-replace.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -3171,6 +3172,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_replace_args *p; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = memdup_user(arg, sizeof(*p)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (p->cmd) { + case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (atomic_xchg( + &root->fs_info->mutually_exclusive_operation_running, + 1)) { + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINPROGRESS; + } else { + ret = btrfs_dev_replace_start(root, p); + atomic_set( + &root->fs_info->mutually_exclusive_operation_running, + 0); + } + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: + btrfs_dev_replace_status(root->fs_info, p); + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: + ret = btrfs_dev_replace_cancel(root->fs_info, p); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, p, sizeof(*p))) + ret = -EFAULT; + + kfree(p); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3826,6 +3872,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_create(root, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(root, argp); + case BTRFS_IOC_DEV_REPLACE: + return btrfs_ioctl_dev_replace(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 62006ba0271..dabca9cc8c2 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 + #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) @@ -127,10 +129,10 @@ struct btrfs_ioctl_scrub_args { #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 struct btrfs_ioctl_dev_replace_start_params { __u64 srcdevid; /* in, if 0, use srcdev_name instead */ - __u8 srcdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ - __u8 tgtdev_name[BTRFS_PATH_NAME_MAX + 1]; /* in */ __u64 cont_reading_from_srcdev_mode; /* in, see #define * above */ + __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ + __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ }; #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 @@ -165,7 +167,6 @@ struct btrfs_ioctl_dev_replace_args { __u64 spare[64]; }; -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 struct btrfs_ioctl_dev_info_args { __u64 devid; /* in/out */ __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ -- cgit v1.2.3 From 071401258a580dec2a3e0c2700b7e76f3ed43320 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 23 Nov 2012 03:03:14 +0000 Subject: Btrfs: do not warn_on io_ctl->cur in io_ctl_map_page io_ctl_map_page is called by many functions in free-space-cache. In most scenarios, the ->cur is not null, e.g. io_ctl_add_entry. I think we'd better remove the warn_on here. Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 557502ca1a2..efdd1d3f441 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) { - WARN_ON(io_ctl->cur); BUG_ON(io_ctl->index >= io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = kmap(io_ctl->page); -- cgit v1.2.3 From db2254bce4f19f458aaa05f9d00b39f413f7488c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 26 Nov 2012 02:58:36 +0000 Subject: Btrfs: fix an while-loop of listxattr If we found an invalid xattr dir item, we'd better try the next one instead. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3f4e2d69e83..e9d38405549 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -265,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(root, leaf, di)) - continue; + goto next; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; -- cgit v1.2.3 From 9a8c28bec1b40e934ed28149b7eaa7d2fafed92d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:40:43 +0000 Subject: Btrfs: pass root object into btrfs_ioctl_{start, wait}_sync() Since we have gotten the root in the caller, just pass it into btrfs_ioctl_{start, wait}_sync() directly. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a71fec8615..5022e62e63a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3058,9 +3058,9 @@ long btrfs_ioctl_trans_end(struct file *file) return 0; } -static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; struct btrfs_trans_handle *trans; u64 transid; int ret; @@ -3081,9 +3081,9 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp return 0; } -static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, + void __user *argp) { - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; u64 transid; if (argp) { @@ -3843,9 +3843,9 @@ long btrfs_ioctl(struct file *file, unsigned int btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; case BTRFS_IOC_START_SYNC: - return btrfs_ioctl_start_sync(file, argp); + return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: - return btrfs_ioctl_wait_sync(file, argp); + return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(root, argp); case BTRFS_IOC_SCRUB_CANCEL: -- cgit v1.2.3 From ff7c1d33551862c86f7737fe88edc3e499d291e6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:41:29 +0000 Subject: Btrfs: don't start a new transaction when starting sync If there is no running transaction in the fs, we needn't start a new one when we want to start sync. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 14 ++++++++++---- fs/btrfs/transaction.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5022e62e63a..7b1f614f51f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,16 +3065,22 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + + /* No running transaction, don't bother */ + transid = root->fs_info->last_trans_committed; + goto out; + } transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root, 0); if (ret) { btrfs_end_transaction(trans, root); return ret; } - +out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) return -EFAULT; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bcc6b65be3b..8db401fa2f8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1307,9 +1307,10 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. */ - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + rwsem_acquire_read( + &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); current->journal_info = ac->newtrans; @@ -1347,8 +1348,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ - rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + if (trans->type < TRANS_JOIN_NOLOCK) + rwsem_release( + &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); schedule_delayed_work(&ac->work, 0); -- cgit v1.2.3 From 8cd2807f79b73ef2d8c1cb6b3732dc5758ac7212 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:42:07 +0000 Subject: Btrfs: fix wrong return value of btrfs_wait_for_commit() If the id of the existed transaction is more than the one we specified, it means the specified transaction was commited, so we should return 0, not EINVAL. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8db401fa2f8..e6509b92433 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -456,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root, int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) { struct btrfs_transaction *cur_trans = NULL, *t; - int ret; + int ret = 0; - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) goto out; + ret = -EINVAL; /* find specified transaction */ spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; atomic_inc(&cur_trans->use_count); + ret = 0; break; } - if (t->transid > transid) + if (t->transid > transid) { + ret = 0; break; + } } spin_unlock(&root->fs_info->trans_lock); - ret = -EINVAL; + /* The specified transaction doesn't exist */ if (!cur_trans) - goto out; /* bad transid */ + goto out; } else { /* find newest transaction that is committing | committed */ spin_lock(&root->fs_info->trans_lock); @@ -497,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } wait_for_commit(root, cur_trans); - put_transaction(cur_trans); - ret = 0; out: return ret; } -- cgit v1.2.3 From 3c04ce01053413007b9df88313b8b8e17272b57b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:07 +0000 Subject: Btrfs: get write access when setting the default subvolume When wen want to set the default subvolume, we must get write access, or we will change the R/O file system. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7b1f614f51f..10bc65ed736 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2843,12 +2843,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_disk_key disk_key; u64 objectid = 0; u64 dir_id; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&objectid, argp, sizeof(objectid))) - return -EFAULT; + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) { + ret = -EFAULT; + goto out; + } if (!objectid) objectid = root->root_key.objectid; @@ -2858,21 +2865,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - if (IS_ERR(new_root)) - return PTR_ERR(new_root); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out; + } - if (btrfs_root_refs(&new_root->root_item) == 0) - return -ENOENT; + if (btrfs_root_refs(&new_root->root_item) == 0) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); - return PTR_ERR(trans); + ret = PTR_ERR(trans); + goto out; } dir_id = btrfs_super_root_dir(root->fs_info->super_copy); @@ -2883,7 +2897,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_end_transaction(trans, root); printk(KERN_ERR "Umm, you don't have the default dir item, " "this isn't going to work\n"); - return -ENOENT; + ret = -ENOENT; + goto out; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); @@ -2893,8 +2908,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans, root); - - return 0; +out: + mnt_drop_write_file(file); + return ret; } void btrfs_get_block_group_info(struct list_head *groups_list, -- cgit v1.2.3 From 198605a8e2077f174c9834c97b836f535e4e56dd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:43:45 +0000 Subject: Btrfs: get write access when doing resize fs Steps to reproduce: # mkfs.btrfs # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs fi resize 10g We re-sized a R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need invoke mnt_want_write_file() to do a full check. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 10bc65ed736..2be49b4c82d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1298,12 +1298,13 @@ out_ra: return ret; } -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, +static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { u64 new_size; u64 old_size; u64 devid = 1; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; @@ -1318,6 +1319,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + ret = mnt_want_write_file(file); + if (ret) + return ret; + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); @@ -1425,6 +1430,7 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3832,7 +3838,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, argp); + return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: -- cgit v1.2.3 From da24927b1e1925da5c1885cb483231dabe027e15 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:44:50 +0000 Subject: Btrfs: get write access when removing a device Steps to reproduce: # mkfs.btrfs -d single -m single # mount -o ro # mount -o ro # mount -o remount,rw # umount # btrfs device delete We can remove a device from a R/O filesystem. The reason is that we just check the R/O flag of the super block object. It is not enough, because the kernel may set the R/O flag only for the mount point. We need invoke mnt_want_write_file() to do a full check. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2be49b4c82d..ee36009f8aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2270,20 +2270,23 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } @@ -2300,6 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); return ret; } @@ -3842,7 +3846,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, argp); + return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: -- cgit v1.2.3 From b8e95489bf0ddf767e4bd38f537e0adad16ee830 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:48:01 +0000 Subject: Btrfs: get write access for scrub We need get write access for scrub, or we will modify the R/O fs. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee36009f8aa..12b18c01b91 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3127,10 +3127,11 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, return btrfs_wait_for_commit(root, transid); } -static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { - int ret; + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_scrub_args *sa; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3139,6 +3140,12 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) + goto out; + } + ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); @@ -3146,6 +3153,9 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; + if (!(sa->flags & BTRFS_SCRUB_READONLY)) + mnt_drop_write_file(file); +out: kfree(sa); return ret; } @@ -3879,7 +3889,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(root, argp); case BTRFS_IOC_SCRUB: - return btrfs_ioctl_scrub(root, argp); + return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: -- cgit v1.2.3 From 905b0dda06a064db08b8a814e968786ff3c4cc19 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 08:50:11 +0000 Subject: Btrfs: get write access for qgroup operations We need get write access for qgroup operations, or we will modify the R/O fs. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 73 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12b18c01b91..657d83ca9de 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3558,8 +3558,9 @@ out: return ret; } -static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_quota_ctl_args *sa; struct btrfs_trans_handle *trans = NULL; int ret; @@ -3568,12 +3569,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { trans = btrfs_start_transaction(root, 2); @@ -3606,14 +3610,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) if (err && !ret) ret = err; } - out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3622,12 +3628,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3650,11 +3659,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3663,12 +3675,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3690,11 +3705,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -3704,12 +3722,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + goto drop_write; + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3732,6 +3753,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) out: kfree(sa); +drop_write: + mnt_drop_write_file(file); return ret; } @@ -3907,13 +3930,13 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp); case BTRFS_IOC_QUOTA_CTL: - return btrfs_ioctl_quota_ctl(root, argp); + return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: - return btrfs_ioctl_qgroup_assign(root, argp); + return btrfs_ioctl_qgroup_assign(file, argp); case BTRFS_IOC_QGROUP_CREATE: - return btrfs_ioctl_qgroup_create(root, argp); + return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: - return btrfs_ioctl_qgroup_limit(root, argp); + return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); } -- cgit v1.2.3 From 9247f3170b2c3d648707c93bbebcd763fac17c06 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:24:43 +0000 Subject: Btrfs: use slabs for auto defrag allocation The auto defrag allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. And besides that, it can do check for leaked objects when the module is removed. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 28 ++++++++++++++++++++++++---- fs/btrfs/super.c | 9 ++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 91ff078e85d..389c05715ea 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3505,6 +3505,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); /* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bd7f1b01e05..15117eae85c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@ #include "compat.h" #include "volumes.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; /* * when auto defrag is enabled we * queue up these defrag structs to remember which @@ -127,7 +128,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, return; exists: - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return; } @@ -157,7 +158,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, else transid = BTRFS_I(inode)->root->last_trans; - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); + defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return -ENOMEM; @@ -169,7 +170,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else - kfree(defrag); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } @@ -315,7 +316,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) next: spin_lock(&fs_info->defrag_inodes_lock); next_free: - kfree(defrag); + if (defrag) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); @@ -2293,3 +2295,21 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_ioctl, #endif }; + +void btrfs_auto_defrag_exit(void) +{ + if (btrfs_inode_defrag_cachep) + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index def4f24b58d..99545df1b86 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1680,10 +1680,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_ordered_data; - err = btrfs_interface_init(); + err = btrfs_auto_defrag_init(); if (err) goto free_delayed_inode; + err = btrfs_interface_init(); + if (err) + goto free_auto_defrag; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1695,6 +1699,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); free_ordered_data: @@ -1714,6 +1720,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); extent_map_exit(); -- cgit v1.2.3 From 8ddc473433b5e8ce8693db9f6e251f5a28267528 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:25:38 +0000 Subject: Btrfs: fix unprotected defragable inode insertion We forget to get the defrag lock when we re-add the defragable inode, Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 70 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15117eae85c..00918321e39 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -91,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, * If an existing record is found the defrag item you * pass in is freed */ -static void __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -119,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode, entry->transid = defrag->transid; if (defrag->last_offset > entry->last_offset) entry->last_offset = defrag->last_offset; - goto exists; + return -EEXIST; } } set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return; + return 0; +} -exists: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return; +static inline int __need_auto_defrag(struct btrfs_root *root) +{ + if (!btrfs_test_opt(root, AUTO_DEFRAG)) + return 0; + + if (btrfs_fs_closing(root->fs_info)) + return 0; + return 1; } /* @@ -143,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; u64 transid; + int ret; - if (!btrfs_test_opt(root, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(root->fs_info)) + if (!__need_auto_defrag(root)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -167,14 +171,50 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) - __btrfs_add_inode_defrag(inode, defrag); - else + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { + /* + * If we set IN_DEFRAG flag and evict the inode from memory, + * and then re-read this inode, this new inode doesn't have + * IN_DEFRAG flag. At the case, we may find the existed defrag. + */ + ret = __btrfs_add_inode_defrag(inode, defrag); + if (ret) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } spin_unlock(&root->fs_info->defrag_inodes_lock); return 0; } +/* + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue. + */ +void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (!__need_auto_defrag(root)) + goto out; + + /* + * Here we don't check the IN_DEFRAG flag, because we need merge + * them together. + */ + spin_lock(&root->fs_info->defrag_inodes_lock); + ret = __btrfs_add_inode_defrag(inode, defrag); + spin_unlock(&root->fs_info->defrag_inodes_lock); + if (ret) + goto out; + return; +out: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + /* * must be called with the defrag_inodes lock held */ @@ -294,7 +334,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ if (num_defrag == defrag_batch) { defrag->last_offset = range.start; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); /* * we don't want to kfree defrag, we added it back to * the rbtree @@ -308,7 +348,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) */ defrag->last_offset = 0; defrag->cycled = 1; - __btrfs_add_inode_defrag(inode, defrag); + btrfs_requeue_inode_defrag(inode, defrag); defrag = NULL; } -- cgit v1.2.3 From 26176e7c2aa923327becdc25b5aca2cb907ac932 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:26:20 +0000 Subject: Btrfs: restructure btrfs_run_defrag_inodes() This patch restructure btrfs_run_defrag_inodes() and make the code of the auto defragment more readable. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 +- fs/btrfs/file.c | 197 +++++++++++++++++++++++++++++------------------------ 3 files changed, 109 insertions(+), 91 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 389c05715ea..6ba56aea5b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3510,6 +3510,7 @@ void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 76b82506bf9..3229531af8c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3329,7 +3329,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_cleanup_defrag_inodes(fs_info); if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 00918321e39..3c6f7479cd5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -216,11 +216,11 @@ out: } /* - * must be called with the defrag_inodes lock held + * pick the defragable inode that we want, if it doesn't exist, we will get + * the next one. */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, - u64 root, u64 ino, - struct rb_node **next) +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) { struct inode_defrag *entry = NULL; struct inode_defrag tmp; @@ -231,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, tmp.ino = ino; tmp.root = root; - p = info->defrag_inodes.rb_node; + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -242,52 +243,128 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, else if (ret > 0) p = parent->rb_right; else - return entry; + goto out; } - if (next) { - while (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); - } - *next = parent; + else + entry = NULL; } - return NULL; +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; } -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + if (need_resched()) { + spin_unlock(&fs_info->defrag_inodes_lock); + cond_resched(); + spin_lock(&fs_info->defrag_inodes_lock); + } + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ struct btrfs_root *inode_root; struct inode *inode; - struct rb_node *n; struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; - u64 first_ino = 0; - u64 root_objectid = 0; int num_defrag; - int defrag_batch = 1024; + /* get the inode */ + key.objectid = defrag->root; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(inode_root)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode_root); + } + + key.objectid = defrag->ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + if (IS_ERR(inode)) { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return PTR_ERR(inode); + } + + /* do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; + range.start = defrag->last_offset; + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + /* + * if we filled the whole defrag batch, there + * must be more work to do. Queue this defrag + * again + */ + if (num_defrag == BTRFS_DEFRAG_BATCH) { + defrag->last_offset = range.start; + btrfs_requeue_inode_defrag(inode, defrag); + } else if (defrag->last_offset && !defrag->cycled) { + /* + * we didn't fill our defrag batch, but + * we didn't start at zero. Make sure we loop + * around to the start of the file. + */ + defrag->last_offset = 0; + defrag->cycled = 1; + btrfs_requeue_inode_defrag(inode, defrag); + } else { + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + } + + iput(inode); + return 0; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; atomic_inc(&fs_info->defrag_running); - spin_lock(&fs_info->defrag_inodes_lock); while(1) { - n = NULL; + if (!__need_auto_defrag(fs_info->tree_root)) + break; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, root_objectid, - first_ino, &n); + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, + first_ino); if (!defrag) { - if (n) { - defrag = rb_entry(n, struct inode_defrag, - rb_node); - } else if (root_objectid || first_ino) { + if (root_objectid || first_ino) { root_objectid = 0; first_ino = 0; continue; @@ -296,71 +373,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } } - /* remove it from the rbtree */ first_ino = defrag->ino + 1; root_objectid = defrag->root; - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - - if (btrfs_fs_closing(fs_info)) - goto next_free; - spin_unlock(&fs_info->defrag_inodes_lock); - - /* get the inode */ - key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(inode_root)) - goto next; - - key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); - if (IS_ERR(inode)) - goto next; - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - range.start = defrag->last_offset; - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - defrag_batch); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == defrag_batch) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(inode, defrag); - /* - * we don't want to kfree defrag, we added it back to - * the rbtree - */ - defrag = NULL; - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(inode, defrag); - defrag = NULL; - } - - iput(inode); -next: - spin_lock(&fs_info->defrag_inodes_lock); -next_free: - if (defrag) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + __btrfs_run_defrag_inode(fs_info, defrag); } - spin_unlock(&fs_info->defrag_inodes_lock); - atomic_dec(&fs_info->defrag_running); /* -- cgit v1.2.3 From b66f00da0cfceb856c17706b77906b63437f6fda Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:27:29 +0000 Subject: Btrfs: fix freeze vs auto defrag If we freeze the fs, the auto defragment should not run. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3c6f7479cd5..d415a052ca9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -318,8 +318,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = defrag->last_offset; + + sb_start_write(fs_info->sb); num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); /* * if we filled the whole defrag batch, there * must be more work to do. Queue this defrag -- cgit v1.2.3 From cb3806ec88a7e1e9d1fbde34cbc0bf153b7e7c3f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 16:10:21 +0000 Subject: Btrfs: fix race in check-integrity caused by usage of bitfield The structure member mirror_num is modified concurrently to the structure member is_iodone. This doesn't require any locking by design, unless everything is stored in the same 32 bits of a bit field. This was the case and xfstest 284 was able to trigger false warnings from the checker code. This patch seperates the bits and fixes the race. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index badc6f141b6..11d47bfb62b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -137,7 +137,7 @@ struct btrfsic_block { unsigned int never_written:1; /* block was added because it was * referenced, not because it was * written */ - unsigned int mirror_num:2; /* large enough to hold + unsigned int mirror_num; /* large enough to hold * BTRFS_SUPER_MIRROR_MAX */ struct btrfsic_dev_state *dev_state; u64 dev_bytenr; /* key, physical byte num on disk */ -- cgit v1.2.3 From f9c83748deb94aba89b5c1085f887ddcdab223ef Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:50 +0000 Subject: Btrfs: fix a build warning for an unused label This issue was detected by the "0-DAY kernel build testing". fs/btrfs/volumes.c: In function 'btrfs_rm_device': fs/btrfs/volumes.c:1505:1: warning: label 'error_close' defined but not used [-Wunused-label] Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 32a4948b621..0b1b7e22e7e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1544,7 +1544,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) error_brelse: brelse(bh); -error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: -- cgit v1.2.3 From af1be4f851db4f4975f0139211a6561776ef37c0 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Nov 2012 17:39:51 +0000 Subject: Btrfs: fix a scrub regression in case of write errors This regression was introduced by the device-replace patches. Scrub immediately stops checking those disks that have write errors. This is nothing that happens in the real world, but it is wrong since scrub is the tool to detect and repair defects. Fix it. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 30ba9972489..8db6a6413a5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2657,7 +2657,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (atomic64_read(&dev_replace->num_write_errors) > 0) { + if (is_dev_replace && + atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; } -- cgit v1.2.3 From 797f4277113bff142b6c64a55abaef64d7d67d5c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:07 +0000 Subject: Btrfs: use existing align macros in btrfs_allocate() The kernel developers have implemented some often-used align macros, we should use them instead of the complex code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d415a052ca9..a43d0aef6ee 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2054,12 +2054,12 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; + alloc_start = round_down(offset, blocksize); + alloc_end = round_up(offset + len, blocksize); /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2140,7 +2140,7 @@ static long btrfs_fallocate(struct file *file, int mode, } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && -- cgit v1.2.3 From 0ff6fabdb0a862b22df4dd75873578392478e64d Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 28 Nov 2012 10:28:54 +0000 Subject: Btrfs: fix off-by-one error of the reserved size of btrfs_allocate() alloc_end is not the real end of the current extent, it is the start of the next adjoining extent. So we needn't +1 when calculating the size the space that is about to be reserved. Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a43d0aef6ee..8e3d6788d6d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2072,7 +2072,7 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; @@ -2179,7 +2179,7 @@ static long btrfs_fallocate(struct file *file, int mode, out: mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; } -- cgit v1.2.3 From 755ac67f83e515af55adbfe55134eb7d90839cdb Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:11 +0000 Subject: Btrfs: skip adding an acl attribute if we don't have to If the acl can be exactly represented in the traditional file mode permission bits, we don't set another acl attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0c16e3dbfd5..e15d2b0d8d3 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; + if (ret == 0) + acl = NULL; } ret = 0; break; -- cgit v1.2.3 From 01e6deb25ae11e7b85484bf5e550eb540c50c63e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 28 Nov 2012 10:43:12 +0000 Subject: Btrfs: don't add a NULL extended attribute Passing a null extended attribute value means to remove the attribute, but we don't have to add a new NULL extended attribute. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/xattr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e9d38405549..aef6bb3c5f5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans, */ if (!value) goto out; + } else { + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + if (!di && !value) + goto out; + btrfs_release_path(path); } again: -- cgit v1.2.3 From 05dadc09f52ad5a631da1aa8767c5b80e121f0c4 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 29 Nov 2012 05:08:26 +0000 Subject: Btrfs: add fiemap's flag check When the flag not supported is specified, it is necessary to return the error to the caller. So, we add the validity check of the fiemap's flag. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d7bf2e7ee8a..a1761f01cf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6595,9 +6595,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, btrfs_submit_direct, 0); } +#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + int ret; + + ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + if (ret) + return ret; + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } -- cgit v1.2.3 From 2794ed013b3551cbae887ea1b93c52aaacb7370d Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a1761f01cf1..adab791e1ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4951,6 +4951,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } + err = btrfs_update_inode(trans, root, inode); + if (err) { + drop_inode = 1; + goto out_unlock; + } + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v1.2.3 From 43baa579b3b1f059f68c51ef754ec59c87a35745 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:09 +0000 Subject: Btrfs: refactor error handling to drop inode in btrfs_create() Refactor it by checking whether the inode has been created and needs to be dropped (drop_inode_on_err) and also if the err variable is set. That way the variable doesn't need to be set on each and every error handling block. Signed-off-by: Filipe Brandenburger Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index adab791e1ce..657f16d9c78 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4989,7 +4989,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int drop_inode = 0; + int drop_inode_on_err = 0; int err; u64 objectid; u64 index = 0; @@ -5014,12 +5014,11 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); goto out_unlock; } + drop_inode_on_err = 1; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; + if (err) goto out_unlock; - } /* * If the active LSM wants to access the inode during @@ -5032,16 +5031,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - d_instantiate(dentry, inode); - } + goto out_unlock; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); + out_unlock: btrfs_end_transaction(trans, root); - if (drop_inode) { + if (err && drop_inode_on_err) { inode_dec_link_count(inode); iput(inode); } -- cgit v1.2.3 From 960097622d48bf0ee8f6c0cf751a904066c4b45b Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 30 Nov 2012 06:30:14 +0000 Subject: Btrfs: use ctl->unit for free space calculation instead of block_group->sectorsize We should use ctl->unit for free space calculation instead of block_group->sectorsize even though for free space use_bitmap or free space cluster we only have sectorsize assigned to ctl->unit currently. Also, we can keep it consisten in code style. Signed-off-by: Wang Sheng-Hui Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index efdd1d3f441..59ea2e4349c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1353,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); BUG_ON(ctl->total_bitmaps > max_bitmaps); @@ -1639,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * some block groups are so tiny they can't be enveloped by a bitmap, so * don't even bother to create a bitmap for this */ - if (BITS_PER_BITMAP * block_group->sectorsize > - block_group->key.offset) + if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) return false; return true; @@ -2287,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long total_found = 0; int ret; - i = offset_to_bit(entry->offset, block_group->sectorsize, + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, ctl->unit); + min_bits = bytes_to_bits(min_bytes, ctl->unit); again: found_bits = 0; @@ -2314,23 +2313,22 @@ again: total_found += found_bits; - if (cluster->max_size < found_bits * block_group->sectorsize) - cluster->max_size = found_bits * block_group->sectorsize; + if (cluster->max_size < found_bits * ctl->unit) + cluster->max_size = found_bits * ctl->unit; if (total_found < want_bits || cluster->max_size < cont1_bytes) { i = next_zero + 1; goto again; } - cluster->window_start = start * block_group->sectorsize + - entry->offset; + cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); + total_found * ctl->unit, 1); return 0; } -- cgit v1.2.3 From 543eabd5e1929bc73e22b279aa911eb01447535f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:52:48 +0000 Subject: Btrfs: don't auto defrag a file when doing directIO If we runt the direct IO, we should not run auto defrag, because it may introduce buffered IO vs direcIO problem, and make direct IO slow down. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 657f16d9c78..bf609581c5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5692,9 +5692,6 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (IS_ERR(trans)) return ERR_CAST(trans); - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) - btrfs_add_inode_defrag(trans, inode); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; alloc_hint = get_extent_allocation_hint(inode, start, len); -- cgit v1.2.3 From 4b5829a8e3104c8bc115d926a0285d3ff9bcfc77 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:25 +0000 Subject: Btrfs: fix missing reserved space release in error path of delalloc reservation We forget to release the reserved space in the error path of delalloc reservatiom, fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 98af8379895..e1528098918 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4559,6 +4559,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); if (ret) { + spin_lock(&BTRFS_I(inode)->lock); + calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4594,6 +4597,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + } mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } -- cgit v1.2.3 From 6347b3c433a4cff00eb2299c7f2c7d1d8b24c1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:53:45 +0000 Subject: Btrfs: fix off-by-one error of the same page check in btrfs_punch_hole() (start + len) is the start of the adjacent extent, not the end of the current extent, so we should not use it to check the hole is on the same page or not. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8e3d6788d6d..d75412bf7c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1867,8 +1867,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - bool same_page = (offset >> PAGE_CACHE_SHIFT) == - ((offset + len) >> PAGE_CACHE_SHIFT); + bool same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); btrfs_wait_ordered_range(inode, offset, len); -- cgit v1.2.3 From 0061280d2c7240805cfd7b1f493da967c97c2f34 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:12 +0000 Subject: Btrfs: fix the page that is beyond EOF Steps to reproduce: # mkfs.btrfs # mount # dd if=/dev/zero of=/ bs=512 seek=5 count=8 # fallocate -p -o 2048 -l 16384 / # dd if=/dev/zero of=/ bs=4096 seek=3 count=8 conv=notrunc,nocreat # umount # dmesg WARNING: at fs/btrfs/inode.c:7140 btrfs_destroy_inode+0x2eb/0x330 The reason is that we inputed a range which is beyond the end of the file. And because the end of this range was not page-aligned, we had to truncate the last page in this range, this operation is similar to a buffered file write. In other words, we reserved enough space and clear the data which was in the hole range on that page. But when we expanded that test file, write the data into the same page, we forgot that we have reserved enough space for the buffered write of that page because in most cases there is no page that is beyond the end of the file. As a result, we reserved the space twice. In fact, we needn't truncate the page if it is beyond the end of the file, just release the allocated space in that range. Fix the above problem by this way. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d75412bf7c4..700ffd266da 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1859,9 +1859,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - u64 lockstart = (offset + mask) & ~mask; - u64 lockend = ((offset + len) & ~mask) - 1; + u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + u64 lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; u64 cur_offset = lockstart; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; @@ -1896,10 +1896,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset + len, 0, 1); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } if (lockend < lockstart) { -- cgit v1.2.3 From 7426cc04d407621773af3a0403e57642e40c36bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:54:52 +0000 Subject: Btrfs: punch hole past the end of the file Since we can pre-allocate the space past EOF, we should be able to reclaim that space if we need. This patch implements it by removing the EOF check. Though the manual of fallocate command says we can use truncate command to reclaim the pre-allocated space which past EOF, but because truncate command changes the file size, we must run several commands to reclaim the space if we don't want to change the file size, so it is not a good choice. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/file.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 700ffd266da..71c2dc1ea15 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1873,26 +1873,28 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_wait_ordered_range(inode, offset, len); mutex_lock(&inode->i_mutex); - if (offset >= inode->i_size) { - mutex_unlock(&inode->i_mutex); - return 0; - } - + /* + * We needn't truncate any page which is beyond the end of the file + * because we are sure there is no data there. + */ /* * Only do this if we are in the same page and we aren't doing the * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - ret = btrfs_truncate_page(inode, offset, len, 0); + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - ret = btrfs_truncate_page(inode, offset, 0, 0); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + ret = btrfs_truncate_page(inode, offset, 0, 0); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } } /* zero the front end of the last page */ -- cgit v1.2.3 From ac6a2b36f9fcfbe4865550afb6d333dec6b57578 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 5 Dec 2012 10:56:13 +0000 Subject: Btrfs: fix wrong return value of btrfs_truncate_page() ret variant may be set to 0 if we read page successfully, but it might be released before we lock it again. On this case, if we fail to allocate a new page, we will return 0, it is wrong, fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf609581c5d..0446cbe8bca 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3521,11 +3521,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if (ret) goto out; - ret = -ENOMEM; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + ret = -ENOMEM; goto out; } @@ -3574,7 +3574,6 @@ again: goto out_unlock; } - ret = 0; if (offset != PAGE_CACHE_SIZE) { if (!len) len = PAGE_CACHE_SIZE - offset; -- cgit v1.2.3 From b8b8ff590f99678616f9ea85f5088542d1cfc0be Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 6 Dec 2012 19:25:48 +0000 Subject: btrfs: Notify udev when removing device Currently udev does not know about the device being removed from the file system. This may result in the situation where we're unable to mount the file system by UUID or by LABEL because the by-uuid and by-label links may still point to the device which is no longer part of the btrfs file system and hence does not have any btrfs super block. It can be easily reproduced by the following: mkfs.btrfs -L bugfs /dev/loop[0-6] mount /dev/loop0 /mnt/test btrfs device delete /dev/loop0 /mnt/test umount /mnt/test mount LABEL=bugfs /mnt/test <---- this fails then see: ls -l /dev/disk/by-label/bugfs which will still point to the /dev/loop0 We did not noticed this before because libblkid would send the udev event for us when it notice that the link does not fit the reality, however it does not do that anymore and completely relies on udev information. Fix this by sending the KOBJ_CHANGE event to the bdev kobject after successful device removal. Note that this does not affect device addition, because we will open the device prior the addition from userspace and udev will notice that and reread the device afterwards. Signed-off-by: Lukas Czerner Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b1b7e22e7e..886f4ba0f71 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -72,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } +static void btrfs_kobject_uevent(struct block_device *bdev, + enum kobject_action action) +{ + int ret; + + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", + action, + kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} + void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -1542,6 +1555,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + error_brelse: brelse(bh); if (bdev) -- cgit v1.2.3 From 5f3ab90a72f98adbf00c50ac2d4d2b47cf4a9685 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 7 Dec 2012 09:28:54 +0000 Subject: Btrfs: rename root_times_lock to root_item_lock Originally root_times_lock was introduced as part of send/receive code however newly developed patch to label the subvol reused the same lock, so renaming it for a meaningful name. Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/send.c | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5c2cf992e71..01efcbc80df 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5114,13 +5114,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); trans = btrfs_join_transaction(left_root); if (IS_ERR(trans)) { @@ -5215,15 +5215,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - spin_lock(&left_root->root_times_lock); + spin_lock(&left_root->root_item_lock); ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_times_lock); + spin_unlock(&left_root->root_item_lock); if (ctransid != left_start_ctransid) left_start_ctransid = 0; - spin_lock(&right_root->root_times_lock); + spin_lock(&right_root->root_item_lock); ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_times_lock); + spin_unlock(&right_root->root_item_lock); if (ctransid != right_start_ctransid) right_start_ctransid = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ba56aea5b6..313a6adfde5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1645,7 +1645,7 @@ struct btrfs_root { int force_cow; - spinlock_t root_times_lock; + spinlock_t root_item_lock; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3229531af8c..faf182691b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1204,7 +1204,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->root_key.objectid = objectid; root->anon_dev = 0; - spin_lock_init(&root->root_times_lock); + spin_lock_init(&root->root_item_lock); } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index eb923d087da..668af537a3e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root_item *item = &root->root_item; struct timespec ct = CURRENT_TIME; - spin_lock(&root->root_times_lock); + spin_lock(&root->root_item_lock); item->ctransid = cpu_to_le64(trans->transid); item->ctime.sec = cpu_to_le64(ct.tv_sec); item->ctime.nsec = cpu_to_le32(ct.tv_nsec); - spin_unlock(&root->root_times_lock); + spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e78b297b0b0..54454542ad4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx) if (!path) return -ENOMEM; - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -4422,9 +4422,9 @@ join_trans: * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. */ - spin_lock(&send_root->root_times_lock); + spin_lock(&send_root->root_item_lock); ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_times_lock); + spin_unlock(&send_root->root_item_lock); if (ctransid != start_ctransid) { WARN(1, KERN_WARNING "btrfs: the root that you're trying to " -- cgit v1.2.3 From e99761514999f64aff1985460967f93d9e8417f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 15:53:56 -0400 Subject: Btrfs: only log the inode item if we can get away with it Currently we copy all the file information into the log, inode item, the refs, xattrs etc. Except most of this doesn't change from fsync to fsync, just the inode item changes. So set a flag if an xattr changes or a link is added, and otherwise only log the inode item. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 10 ++++++++-- fs/btrfs/xattr.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ed8ca7ca5ef..2411baf3522 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -39,6 +39,7 @@ #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0446cbe8bca..123815f3b45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5083,6 +5083,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 40b9efd20e4..f05fca778cb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3429,14 +3429,20 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = BTRFS_INODE_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, - BTRFS_XATTR_ITEM_KEY); + max_key.type); } } if (ret) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index aef6bb3c5f5..446a6848c55 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -208,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; + set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: -- cgit v1.2.3 From a95249b392c3ab843d7b25ab6817ecc9ea0b82ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:17:34 -0400 Subject: Btrfs: don't bother copying if we're only logging the inode We don't copy inode items anwyay, we just copy them straight into the log from the in memory inode. So if we know we're only logging the inode, don't bother dropping anything, just try to insert it and either if it succeeds or we get EEXIST we can update the inode item in the log and carry on. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f05fca778cb..ab7168ee618 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2996,6 +2996,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, } +static int log_inode_item(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct btrfs_path *path, + struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_key key; + int ret; + + memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*inode_item)); + if (ret && ret != -EEXIST) + return ret; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + btrfs_release_path(path); + return 0; +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, @@ -3433,17 +3453,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); - } else { + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_ALL) fast_search = true; - if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) - max_key.type = BTRFS_XATTR_ITEM_KEY; - else - max_key.type = BTRFS_INODE_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); + } else { + if (inode_only == LOG_INODE_ALL) + fast_search = true; + ret = log_inode_item(trans, log, dst_path, inode); + if (ret) { + err = ret; + goto out_unlock; + } + goto log_extents; } + } if (ret) { err = ret; @@ -3522,6 +3549,7 @@ next_slot: ins_nr = 0; } +log_extents: if (fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); -- cgit v1.2.3 From b812ce28796f746f14ba6cc451250c422db447b2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Nov 2012 13:56:32 -0500 Subject: Btrfs: inline csums if we're fsyncing The tree logging stuff needs the csums to be on the ordered extents in order to log them properly, so mark that we're sync and inline the csum creation so we don't have to wait on the csumming to be done when logging extents that are still in flight. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/file.c | 8 ++++++++ fs/btrfs/inode.c | 11 ++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2411baf3522..2a8c242bc4f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -91,6 +91,9 @@ struct btrfs_inode { unsigned long runtime_flags; + /* Keep track of who's O_SYNC/fsycing currently */ + atomic_t sync_writers; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 71c2dc1ea15..7f4654a1520 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1472,6 +1472,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; + bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); sb_start_write(inode->i_sb); @@ -1529,6 +1530,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } } + if (sync) + atomic_inc(&BTRFS_I(inode)->sync_writers); + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1563,6 +1567,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + if (sync) + atomic_dec(&BTRFS_I(inode)->sync_writers); sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -1613,7 +1619,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. */ + atomic_inc(&BTRFS_I(inode)->sync_writers); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 123815f3b45..7855aac3670 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1622,6 +1622,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; int metadata = 0; + int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -1644,7 +1645,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto out; } goto mapit; - } else if (!skip_sum) { + } else if (async && !skip_sum) { /* csum items have already been cloned */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; @@ -1655,6 +1656,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btrfs_submit_bio_start, __btrfs_submit_bio_done); goto out; + } else if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + if (ret) + goto out; } mapit: @@ -6333,6 +6338,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (async_submit) + async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); + bio_get(bio); if (!write) { @@ -7113,6 +7121,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); ei->io_tree.track_uptodate = 1; ei->io_failure_tree.track_uptodate = 1; + atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -- cgit v1.2.3 From b493968096944a11422c4d80fb87af537ca1cac7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:31:19 -0500 Subject: Btrfs: keep track of the extents original block length If we've written to a prealloc extent we need to know the original block len for the extent. We can't figure this out currently since ->block_len is just set to the extent length. So introduce ->orig_block_len so that we know how many bytes were in the original extent for proper extent logging that future patches will need. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.h | 1 + fs/btrfs/file.c | 5 +++++ fs/btrfs/inode.c | 22 ++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 679225555f7..99a0dcb5ba2 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -24,6 +24,7 @@ struct extent_map { u64 mod_start; u64 mod_len; u64 orig_start; + u64 orig_block_len; u64 block_start; u64 block_len; u64 generation; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7f4654a1520..6810145f4e9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -588,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -609,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; + split->orig_block_len = max(em->block_len, + em->orig_block_len); if (compressed) { split->block_len = em->block_len; @@ -1838,6 +1842,7 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7855aac3670..bfd59bcc50d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -699,6 +699,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -886,6 +887,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -1143,6 +1145,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 extent_offset; u64 disk_bytenr; u64 num_bytes; + u64 disk_num_bytes; int extent_type; int ret, err; int type; @@ -1245,6 +1248,8 @@ next_slot: extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); + disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); if (extent_end <= start) { path->slots[0]++; goto next_slot; @@ -1319,6 +1324,7 @@ out_check: em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; + em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -3696,6 +3702,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; + hole_em->orig_block_len = 0; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -5374,6 +5381,8 @@ again: em->len = extent_end - extent_start; em->orig_start = extent_start - btrfs_file_extent_offset(leaf, item); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); bytenr = btrfs_file_extent_disk_bytenr(leaf, item); if (bytenr == 0) { em->block_start = EXTENT_MAP_HOLE; @@ -5383,8 +5392,7 @@ again: set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; em->block_start = bytenr; - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); + em->block_len = em->orig_block_len; } else { bytenr += btrfs_file_extent_offset(leaf, item); em->block_start = bytenr; @@ -5414,6 +5422,7 @@ again: em->start = extent_start + extent_offset; em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + em->orig_block_len = em->len; em->orig_start = EXTENT_MAP_INLINE; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -5721,6 +5730,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; /* @@ -5914,7 +5924,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - int type) + u64 orig_block_len, int type) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -5932,6 +5942,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_len = block_len; em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); @@ -6068,12 +6079,14 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_odirect(trans, inode, start, len) == 1) { u64 orig_start = em->start; + u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, orig_start, - block_start, len, type); + block_start, len, + orig_block_len, type); if (IS_ERR(em)) { btrfs_end_transaction(trans, root); goto unlock_err; @@ -7771,6 +7784,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->len = ins.offset; em->block_start = ins.objectid; em->block_len = ins.offset; + em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; -- cgit v1.2.3 From b11e234d21e73df94099e473a080bca502b9a496 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Dec 2012 10:58:15 -0500 Subject: Btrfs: do not mark ems as prealloc if we are writing to them We are going to use EM's to log extents in the future, so we need to not mark them as prealloc if they aren't actually prealloc extents. Instead mark them with FILLING so we know to ammend mod_start/mod_len and that way we don't confuse the extent logging code. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/extent_map.h | 1 + fs/btrfs/inode.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8cbc8d5c7f..85ae2b6fe03 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -266,9 +266,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { prealloc = true; - clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + clear_bit(EXTENT_FLAG_FILLING, &em->flags); } try_merge_map(tree, em); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 99a0dcb5ba2..922943ce29e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -14,6 +14,7 @@ #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bfd59bcc50d..73e6833dcc2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1327,7 +1327,7 @@ out_check: em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -5945,7 +5945,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->orig_block_len = orig_block_len; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + set_bit(EXTENT_FLAG_FILLING, &em->flags); do { btrfs_drop_extent_cache(inode, em->start, -- cgit v1.2.3 From d6393786cd40f67709324bc4f08d7e4b911153fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Dec 2012 17:00:01 -0500 Subject: Btrfs: add path->really_keep_locks You'd think path->keep_locks would keep all the locks wouldn't you? You'd be wrong. It only keeps them if the slot is pointing to the last item in the node. This is for use with btrfs_next_leaf, which needs this sort of thing. But the horrible horrible things I'm going to do to the tree log means I really need everything held from root to leaf so I can add and delete items in the same search. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 7 +++++-- fs/btrfs/ctree.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 01efcbc80df..0c5c28ff794 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2212,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; + if (path->really_keep_locks) + return; + for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2259,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks) + if (path->keep_locks || path->really_keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2492,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->keep_locks || p->lowest_level)) + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = write_lock_level; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 313a6adfde5..9ed452f5d06 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,6 +576,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int really_keep_locks:1; }; /* -- cgit v1.2.3 From 70c8a91ce21b83ccd2d9e7c968775430ead4353d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 Oct 2012 16:54:30 -0400 Subject: Btrfs: log changed inodes based on the extent map tree We don't really need to copy extents from the source tree since we have all of the information already available to us in the extent_map tree. So instead just write the extents straight to the log tree and don't bother to copy the extent items from the source tree. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 133 ++++++++++++++++++++ fs/btrfs/ctree.h | 3 + fs/btrfs/extent_map.c | 20 ++- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 85 +++++-------- fs/btrfs/tree-log.c | 338 +++++++++++++++++++++++++++++--------------------- fs/btrfs/volumes.c | 1 + 7 files changed, 372 insertions(+), 210 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0c5c28ff794..e8b32641ea9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5490,6 +5490,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } +/* Release the path up to but not including the given level */ +static void btrfs_release_level(struct btrfs_path *path, int level) +{ + int i; + + for (i = 0; i < level; i++) { + path->slots[i] = 0; + if (!path->nodes[i]) + continue; + if (path->locks[i]) { + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } +} + +/* + * This function assumes 2 things + * + * 1) You are using path->keep_locks + * 2) You are not inserting items. + * + * If either of these are not true do not use this function. If you need a next + * leaf with either of these not being true then this function can be easily + * adapted to do that, but at the moment these are the limitations. + */ +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del) +{ + struct extent_buffer *b; + struct btrfs_key key; + u32 nritems; + int level = 1; + int slot; + int ret = 1; + int write_lock_level = BTRFS_MAX_LEVEL; + int ins_len = del ? -1 : 0; + + WARN_ON(!(path->keep_locks || path->really_keep_locks)); + + nritems = btrfs_header_nritems(path->nodes[0]); + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + while (path->nodes[level]) { + nritems = btrfs_header_nritems(path->nodes[level]); + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { +search: + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, + ins_len, 1); + if (ret < 0) + goto out; + level = 1; + continue; + } + + if (path->slots[level] >= nritems - 1) { + level++; + continue; + } + + btrfs_release_level(path, level); + break; + } + + if (!path->nodes[level]) { + ret = 1; + goto out; + } + + path->slots[level]++; + b = path->nodes[level]; + + while (b) { + level = btrfs_header_level(b); + + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(path); + ret = btrfs_cow_block(trans, root, b, + path->nodes[level + 1], + path->slots[level + 1], &b); + if (ret) + goto out; +cow_done: + path->nodes[level] = b; + btrfs_clear_path_blocking(path, NULL, 0); + if (level != 0) { + ret = setup_nodes_for_search(trans, root, path, b, + level, ins_len, + &write_lock_level); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + + b = path->nodes[level]; + slot = path->slots[level]; + + ret = read_block_for_search(trans, root, path, + &b, level, slot, &key, 0); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + level = btrfs_header_level(b); + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(path, b, + BTRFS_WRITE_LOCK); + } + path->locks[level] = BTRFS_WRITE_LOCK; + path->nodes[level] = b; + path->slots[level] = 0; + } else { + path->slots[level] = 0; + ret = 0; + break; + } + } + +out: + if (ret) + btrfs_release_path(path); + + return ret; +} + int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ed452f5d06..55aff6764bb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3187,6 +3187,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 85ae2b6fe03..fff2c28497b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) struct extent_map *alloc_extent_map(void) { struct extent_map *em; - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); + em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; em->in_tree = 0; @@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(merge, em)) { em->start = merge->start; + em->orig_start = merge->orig_start; em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_start = em->start; - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); + list_move(&em->list, &tree->modified_extents); list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); @@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; - if (merge->generation > em->generation) { - em->mod_len = em->len; - em->generation = merge->generation; - list_move(&em->list, &tree->modified_extents); - } + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); list_del_init(&merge->list); free_extent_map(merge); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6810145f4e9..c56088ece50 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -621,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } else { split->block_len = split->len; split->block_start = em->block_start + diff; - split->orig_start = split->start; + split->orig_start = em->orig_start; } ret = add_extent_mapping(em_tree, split); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73e6833dcc2..355a297e798 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -95,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, + u64 len, u64 orig_start, + u64 block_start, u64 block_len, + u64 orig_block_len, int type); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -704,10 +708,14 @@ retry: em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -890,10 +898,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1320,7 +1332,7 @@ out_check: em = alloc_extent_map(); BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; - em->orig_start = em->start; + em->orig_start = found_key.offset - extent_offset; em->len = num_bytes; em->block_len = num_bytes; em->block_start = disk_bytenr; @@ -1328,9 +1340,13 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); + em->generation = -1; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -5371,6 +5387,7 @@ again: if (start + len <= found_key.offset) goto not_found; em->start = start; + em->orig_start = start; em->len = found_key.offset - start; goto not_found_em; } @@ -5423,7 +5440,7 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_block_len = em->len; - em->orig_start = EXTENT_MAP_INLINE; + em->orig_start = em->start; if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; @@ -5476,6 +5493,7 @@ again: } not_found: em->start = start; + em->orig_start = start; em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; @@ -5677,30 +5695,14 @@ out: } static struct extent_map *btrfs_new_extent_direct(struct inode *inode, - struct extent_map *em, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - bool insert = false; - - /* - * Ok if the extent map we looked up is a hole and is for the exact - * range we want, there is no reason to allocate a new one, however if - * it is not right then we need to free this one and drop the cache for - * our range. - */ - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || - em->len != len) { - free_extent_map(em); - em = NULL; - insert = true; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); - } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -5716,38 +5718,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, goto out; } - if (!em) { - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - } - - em->start = start; - em->orig_start = em->start; - em->len = ins.offset; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->orig_block_len = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - - /* - * We need to do this because if we're using the original em we searched - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. - */ - em->flags = 0; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - while (insert) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); - } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, 0); + if (IS_ERR(em)) + goto out; ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); @@ -5943,6 +5917,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; + em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) set_bit(EXTENT_FLAG_FILLING, &em->flags); @@ -5952,6 +5927,9 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); + if (!ret) + list_move(&em->list, + &em_tree->modified_extents); write_unlock(&em_tree->lock); } while (ret == -EEXIST); @@ -6078,7 +6056,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { - u64 orig_start = em->start; + u64 orig_start = em->orig_start; u64 orig_block_len = em->orig_block_len; if (type == BTRFS_ORDERED_PREALLOC) { @@ -6110,7 +6088,8 @@ must_cow: * it above */ len = bh_result->b_size; - em = btrfs_new_extent_direct(inode, em, start, len); + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ab7168ee618..72444811d27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3150,145 +3150,220 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -struct log_args { - struct extent_buffer *src; - u64 next_offset; - int start_slot; - int nr; -}; +static int drop_adjacent_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct extent_map *em, + struct btrfs_path *path) +{ + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_key key, new_key; + struct btrfs_map_token token; + u64 extent_end; + u64 extent_offset = 0; + int extent_type; + int del_slot = 0; + int del_nr = 0; + int ret = 0; + + while (1) { + btrfs_init_map_token(&token); + leaf = path->nodes[0]; + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + if (del_nr) { + ret = btrfs_del_items(trans, root, path, + del_slot, del_nr); + if (ret) + return ret; + del_nr = 0; + } + + ret = btrfs_next_leaf_write(trans, root, path, 1); + if (ret < 0) + return ret; + if (ret > 0) + return 0; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || + key.type != BTRFS_EXTENT_DATA_KEY || + key.offset >= em->start + em->len) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_token_file_extent_type(leaf, fi, &token); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_offset = btrfs_token_file_extent_offset(leaf, + fi, &token); + extent_end = key.offset + + btrfs_token_file_extent_num_bytes(leaf, fi, + &token); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); + } else { + BUG(); + } + + if (extent_end <= em->len + em->start) { + if (!del_nr) { + del_slot = path->slots[0]; + } + del_nr++; + continue; + } + + /* + * Ok so we'll ignore previous items if we log a new extent, + * which can lead to overlapping extents, so if we have an + * existing extent we want to adjust we _have_ to check the next + * guy to make sure we even need this extent anymore, this keeps + * us from panicing in set_item_key_safe. + */ + if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(leaf, &tmp_key, + path->slots[0] + 1); + if (tmp_key.objectid == btrfs_ino(inode) && + tmp_key.type == BTRFS_EXTENT_DATA_KEY && + tmp_key.offset <= em->start + em->len) { + if (!del_nr) + del_slot = path->slots[0]; + del_nr++; + continue; + } + } + + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = em->start + em->len; + btrfs_set_item_key_safe(trans, root, path, &new_key); + extent_offset += em->start + em->len - key.offset; + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - + (em->start + em->len), + &token); + btrfs_mark_buffer_dirty(leaf); + } + + if (del_nr) + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + + return ret; +} static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct btrfs_path *dst_path, struct log_args *args) + struct extent_map *em, struct btrfs_path *path) { struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct list_head ordered_sums; struct btrfs_key key; - u64 start = em->mod_start; - u64 search_start = start; - u64 len = em->mod_len; - u64 num_bytes; - int nritems; + u64 csum_offset = em->mod_start - em->start; + u64 csum_len = em->mod_len; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; int ret; + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (BTRFS_I(inode)->logged_trans == trans->transid) { - ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, - start + len, NULL, 0); - if (ret) - return ret; + INIT_LIST_HEAD(&ordered_sums); + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + path->really_keep_locks = 1; + + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); + if (ret && ret != -EEXIST) { + path->really_keep_locks = 0; + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, em->generation); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + skip_csum = true; + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC); + } else { + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + if (em->block_start == 0) + skip_csum = true; + } + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + } else { + btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); } - while (len) { - if (args->nr) - goto next_slot; -again: - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = search_start; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - - if (ret) { - /* - * A rare case were we can have an em for a section of a - * larger extent so we need to make sure that this em - * falls within the extent we've found. If not we just - * bail and go back to ye-olde way of doing things but - * it happens often enough in testing that we need to do - * this dance to make sure. - */ - do { - if (path->slots[0] == 0) { - btrfs_release_path(path); - if (search_start == 0) - return -ENOENT; - search_start--; - goto again; - } + btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); + btrfs_set_file_extent_num_bytes(leaf, fi, em->len); + btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); + btrfs_set_file_extent_compression(leaf, fi, em->compress_type); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_mark_buffer_dirty(leaf); - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - btrfs_release_path(path); - return -ENOENT; - } - } while (key.offset > start); + /* + * Have to check the extent to the right of us to make sure it doesn't + * fall in our current range. We're ok if the previous extent is in our + * range since the recovery stuff will run us in key order and thus just + * drop the part we overwrote. + */ + ret = drop_adjacent_extents(trans, log, inode, em, path); + btrfs_release_path(path); + path->really_keep_locks = 0; + if (ret) { + return ret; + } - num_bytes = btrfs_file_extent_length(path); - if (key.offset + num_bytes <= start) { - btrfs_release_path(path); - return -ENOENT; - } - } - args->src = path->nodes[0]; -next_slot: - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - num_bytes = btrfs_file_extent_length(path); - if (args->nr && - args->start_slot + args->nr == path->slots[0]) { - args->nr++; - } else if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 1; - args->start_slot = path->slots[0]; - } else if (!args->nr) { - args->nr = 1; - args->start_slot = path->slots[0]; - } - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (len < num_bytes) { - /* I _think_ this is ok, envision we write to a - * preallocated space that is adjacent to a previously - * written preallocated space that gets merged when we - * mark this preallocated space written. If we do not - * have the adjacent extent in cache then when we copy - * this extent it could end up being larger than our EM - * thinks it is, which is a-ok, so just set len to 0. - */ - len = 0; - } else { - len -= num_bytes; - } - start = key.offset + num_bytes; - args->next_offset = start; - search_start = start; + if (skip_csum) + return 0; - if (path->slots[0] < nritems) { - if (len) - goto next_slot; - break; - } + /* block start is already adjusted for the file extent offset. */ + ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0); + if (ret) + return ret; - if (args->nr) { - ret = copy_items(trans, inode, dst_path, args->src, - args->start_slot, args->nr, - LOG_INODE_ALL); - if (ret) - return ret; - args->nr = 0; - btrfs_release_path(path); - } + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); + list_del(&sums->list); + kfree(sums); } - return 0; + return ret; } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *path) { - struct log_args args; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -3297,8 +3372,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - memset(&args, 0, sizeof(args)); - write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -3331,34 +3404,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); - /* - * If the previous EM and the last extent we left off on aren't - * sequential then we need to copy the items we have and redo - * our search - */ - if (args.nr && em->mod_start != args.next_offset) { - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, - LOG_INODE_ALL); - if (ret) { - free_extent_map(em); - write_lock(&tree->lock); - continue; - } - btrfs_release_path(path); - args.nr = 0; - } - - ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + ret = log_one_extent(trans, inode, root, em, path); free_extent_map(em); write_lock(&tree->lock); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - if (!ret && args.nr) - ret = copy_items(trans, inode, dst_path, args.src, - args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); return ret; } @@ -3551,10 +3603,8 @@ next_slot: log_extents: if (fast_search) { - btrfs_release_path(path); btrfs_release_path(dst_path); - ret = btrfs_log_changed_extents(trans, root, inode, path, - dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 886f4ba0f71..d79b5b620e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4983,6 +4983,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em->bdev = (struct block_device *)map; em->start = logical; em->len = length; + em->orig_start = 0; em->block_start = 0; em->block_len = em->len; -- cgit v1.2.3 From bb146eb265091f472ada52a3419d41e9b0ff1f7d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:30:43 -0400 Subject: Btrfs: move checks in set_page_dirty under DEBUG This is a high traffic function, let's try and do as little as possible during normal operations shall we? Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faf182691b4..b8f7f04a640 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1001,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) static int btree_set_page_dirty(struct page *page) { +#ifdef DEBUG struct extent_buffer *eb; BUG_ON(!PagePrivate(page)); @@ -1009,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_locked(eb); +#endif return __set_page_dirty_nobuffers(page); } -- cgit v1.2.3 From ed7b63eb8afd0bb8d978a23184d70c105b54aa26 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:33:54 -0400 Subject: Btrfs: only clear dirty on the buffer if it is marked as dirty No reason to set the path blocking or loop through all of the pages if the extent buffer isn't actually marked dirty. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8f7f04a640..65f03670a95 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, root->fs_info->dirty_metadata_bytes); } spin_unlock(&root->fs_info->delalloc_lock); - } - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); + } } } -- cgit v1.2.3 From ad9145596986b672d8c8235c92ed5307f82d045d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:39:33 -0400 Subject: Btrfs: don't memset new tokens Our token logic depends on token->kaddr being set, and if it is not it sets everything properly as needed. So instead of memsetting just set token->kaddr to NULL. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 55aff6764bb..cd02205f13c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1860,7 +1860,7 @@ struct btrfs_map_token { static inline void btrfs_init_map_token (struct btrfs_map_token *token) { - memset(token, 0, sizeof(*token)); + token->kaddr = NULL; } /* some macros to generate set/get funcs for the struct fields. This -- cgit v1.2.3 From 41be1f3b40b87de33cd2e7463dce88596dbdccc4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 15 Oct 2012 13:43:18 -0400 Subject: Btrfs: optimize leaf_space_used This gets called at least 4 times for every level while adding an object, and it involves 3 kmapping calls, which on my box take about 5us a piece. So instead use a token, which brings us down to 1 kmap call and makes this function take 1/3 of the time per call. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e8b32641ea9..e7bea1d5f75 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3298,14 +3298,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(l, start); + end_item = btrfs_item_nr(l, end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; -- cgit v1.2.3 From 0b1c6ccadee4ea4adb98799f3430fc72e57a187f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Oct 2012 16:03:44 -0400 Subject: Btrfs: use tokens where we can in the tree log If we are syncing over and over the overhead of doing all those maps in fill_inode_item and log_changed_extents really starts to hurt, so use map tokens so we can avoid all the extra mapping. Since the token maps from our offset to the end of the page make sure to set the first thing in the item first so we really only do one map. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 127 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 54 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 72444811d27..83186c7e45d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode, int log_inode_only) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (log_inode_only) { /* set the generation to zero so the recover code @@ -2986,14 +2962,43 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_inode_generation(leaf, item, 0); - btrfs_set_inode_size(leaf, item, 0); + btrfs_set_token_inode_generation(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, 0, &token); } else { - btrfs_set_inode_generation(leaf, item, - BTRFS_I(inode)->generation); - btrfs_set_inode_size(leaf, item, inode->i_size); - } - + btrfs_set_token_inode_generation(leaf, item, + BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + } + + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -3267,6 +3272,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct list_head ordered_sums; + struct btrfs_map_token token; struct btrfs_key key; u64 csum_offset = em->mod_start - em->start; u64 csum_len = em->mod_len; @@ -3276,6 +3282,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; INIT_LIST_HEAD(&ordered_sums); + btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; @@ -3289,37 +3296,49 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, em->generation); + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { skip_csum = true; - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); } else { - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); if (em->block_start == 0) skip_csum = true; } block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, em->block_start); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, block_len); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); } else { - btrfs_set_file_extent_disk_bytenr(leaf, fi, 0); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, 0); - } - - btrfs_set_file_extent_offset(leaf, fi, em->start - em->orig_start); - btrfs_set_file_extent_num_bytes(leaf, fi, em->len); - btrfs_set_file_extent_ram_bytes(leaf, fi, em->len); - btrfs_set_file_extent_compression(leaf, fi, em->compress_type); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, + em->start - em->orig_start, + &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); /* -- cgit v1.2.3 From 5124e00ec5b0be56155a11aec416fcc5125339f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Nov 2012 13:44:13 -0500 Subject: Btrfs: only unlock and relock if we have to I noticed while doing fsync tests that we were always dropping the path and re-searching when we first cow the log root even though we've already gotten the write lock on the root. That's because we don't take into account that there might not be a parent node, so fix the check to make sure there is actually a parent node before we undo all of this work for nothing. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e7bea1d5f75..c7b67cf24bb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,7 +2564,10 @@ again: * must have write locks on this node and the * parent */ - if (level + 1 > write_lock_level) { + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { write_lock_level = level + 1; btrfs_release_path(p); goto again; -- cgit v1.2.3 From 6c760c072403f446ff829ec9e89568943a3c2ef2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Nov 2012 10:53:21 -0500 Subject: Btrfs: do not call file_update_time in aio_write This starts a transaction and dirties the inode everytime we call it, which is super expensive if you have a write heavy workload. We will be updating the inode when the IO completes and we reserve the space for the inode update when we reserve space for the write, so there is no chance of loss of information or enospc issues. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/inode.c | 42 ++++++++++++++++++------------------------ 2 files changed, 48 insertions(+), 29 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c56088ece50..20452c110d7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1464,6 +1464,24 @@ out: return written ? written : err; } +static void update_time_for_write(struct inode *inode) +{ + struct timespec now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static ssize_t btrfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) @@ -1519,11 +1537,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = file_update_time(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. + */ + update_time_for_write(inode); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { @@ -1563,8 +1583,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * this will either be one more than the running transaction * or the generation used for the next transaction if there isn't * one running right now. + * + * We also have to set last_sub_trans to the current log transid, + * otherwise subsequent syncs to a file that's been synced in this + * transaction will appear to have already occured. */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0 || num_written == -EIOCBQUEUED) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 355a297e798..1673dbdf1f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1922,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (nolock) + trans = btrfs_join_transaction_nolock(root); + else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); goto out; } @@ -1986,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - } else { - btrfs_set_inode_last_trans(trans, inode); + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode_fallback(trans, root, inode); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } ret = 0; out_unlock: -- cgit v1.2.3 From 4ded4f639533ed5f02a0f0ab20d43bb9659c91f8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 14 Nov 2012 18:57:29 +0000 Subject: Btrfs: fix BUG() in scrub when first superblock reading gives EIO This fixes a very special case that can be reproduced by just disconnecting a disk at runtime, and without unmounting the filesystem first, start scrub on the filesystem with the disconnected disk. All read and write EIOs are handled correctly, only the first superblock is an exception and gives a BUG() in a subfunction. The BUG() is correct, it would crash later otherwise. The subfunction must not be called for superblocks and this is what the fix changes. Reported-by: Joeri Vanthienen Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8db6a6413a5..bdbb94f245c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -785,6 +785,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BUG_ON(sblock_to_check->page_count < 1); fs_info = sctx->dev_root->fs_info; + if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + /* + * if we find an error in a super block, we just report it. + * They will get written with the next transaction commit + * anyway + */ + spin_lock(&sctx->stat_lock); + ++sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + return 0; + } length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; generation = sblock_to_check->pagev[0]->generation; -- cgit v1.2.3 From 31e502298d80e2af9001d17dc419a3fd4b0bebef Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 21 Nov 2012 14:18:10 +0000 Subject: Btrfs: put raid properties into global table Raid properties can be shared among raid calculation code, we can put them into a global table to keep it simple. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/volumes.c | 46 ++++++++++++++++------------------------------ fs/btrfs/volumes.h | 9 +++++++++ 4 files changed, 29 insertions(+), 33 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd02205f13c..44d9bc87e86 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3077,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e1528098918..b9526f74904 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5479,7 +5479,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int __get_block_group_index(u64 flags) +int __get_raid_index(u64 flags) { int index; @@ -5499,7 +5499,7 @@ static int __get_block_group_index(u64 flags) static int get_block_group_index(struct btrfs_block_group_cache *cache) { - return __get_block_group_index(cache->flags); + return __get_raid_index(cache->flags); } enum btrfs_loop_type { @@ -7441,7 +7441,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ target = get_restripe_target(root->fs_info, block_group->flags); if (target) { - index = __get_block_group_index(extended_to_chunk(target)); + index = __get_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d79b5b620e9..5cce6aa7401 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3491,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } +struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + { 2, 1, 0, 4, 2, 2 /* raid10 */ }, + { 1, 1, 2, 2, 2, 2 /* raid1 */ }, + { 1, 2, 1, 1, 1, 2 /* dup */ }, + { 1, 1, 0, 2, 1, 1 /* raid0 */ }, + { 1, 1, 0, 1, 1, 1 /* single */ }, +}; + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3520,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int ndevs; int i; int j; + int index; BUG_ON(!alloc_profile_is_valid(type, 0)); if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; - sub_stripes = 1; - dev_stripes = 1; - devs_increment = 1; - ncopies = 1; - devs_max = 0; /* 0 == as many as possible */ - devs_min = 1; + index = __get_raid_index(type); - /* - * define the properties of each RAID type. - * FIXME: move this to a global table and use it in all RAID - * calculation code - */ - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - dev_stripes = 2; - ncopies = 2; - devs_max = 1; - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - devs_increment = 2; - ncopies = 2; - devs_max = 2; - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - sub_stripes = 2; - devs_increment = 2; - ncopies = 2; - devs_min = 4; - } else { - devs_max = 1; - } + sub_stripes = btrfs_raid_array[index].sub_stripes; + dev_stripes = btrfs_raid_array[index].dev_stripes; + devs_max = btrfs_raid_array[index].devs_max; + devs_min = btrfs_raid_array[index].devs_min; + devs_increment = btrfs_raid_array[index].devs_increment; + ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37d0157167b..d3c3939ac75 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -180,6 +180,15 @@ struct btrfs_device_info { u64 total_avail; }; +struct btrfs_raid_attr { + int sub_stripes; /* sub_stripes info for map */ + int dev_stripes; /* stripes per dev */ + int devs_max; /* max devs to use */ + int devs_min; /* min devs needed */ + int devs_increment; /* ndevs has to be a multiple of this */ + int ncopies; /* how many copies to data has */ +}; + struct map_lookup { u64 type; int io_align; -- cgit v1.2.3 From 9185aa587b7425f8f4520da2e66792f5f3c2b815 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Fri, 30 Nov 2012 03:40:08 +0000 Subject: Btrfs: fix permissions of empty files not affected by umask When a new file is created with btrfs_create(), the inode will initially be created with permissions 0666 and later on in btrfs_init_acl() it will be adapted to mask out the umask bits. The problem is that this change won't make it into the btrfs_inode unless there's another change to the inode (e.g. writing content changing the size or touching the file changing the mtime.) This fix adds a call to btrfs_update_inode() to btrfs_create() to make sure that the change will not get lost if the in-memory inode is flushed before other changes are made to the file. Signed-off-by: Filipe Brandenburger Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1673dbdf1f7..2e6918c85b7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5041,6 +5041,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock; + /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see -- cgit v1.2.3 From 1135d6df222046a0ec14a1c9335de99907879922 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:46:43 -0500 Subject: Btrfs: fix autodefrag and umount lockup This happens because writeback_inodes_sb_nr_if_idle does down_read. This doesn't work for us and it has not been fixed upstream yet, so do it ourselves and use that instead so we can stop having this stupid long standing lockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b9526f74904..afc3ac5e57d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3689,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root, return 0; } +static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, + unsigned long nr_pages, + enum wb_reason reason) +{ + if (!writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, reason); + up_read(&sb->s_umount); + return 1; + } + + return 0; +} + /* * shrink metadata reservation for delalloc */ @@ -3721,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, + nr_pages, + WB_REASON_FS_FREE_SPACE); /* * We need to wait for the async pages to actually start before -- cgit v1.2.3 From c64c2bd890df3b9a66c52c33df110777058c011e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Dec 2012 13:48:14 -0500 Subject: Btrfs: don't take inode delalloc mutex if we're a free space inode This confuses and angers lockdep even though it's ok. We don't really need the lock for free space inodes since only the transaction committer will be reserving space. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index afc3ac5e57d..d133edfcd44 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4535,16 +4535,25 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret; + bool delalloc_lock = true; - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(inode)) + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } if (flush != BTRFS_RESERVE_NO_FLUSH && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4577,7 +4586,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } } @@ -4616,7 +4626,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4628,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(root->fs_info,"delalloc", -- cgit v1.2.3 From 9c52057c698fb96f8f07e7a4bcf4801a092bda89 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 17 Dec 2012 14:26:57 -0500 Subject: Btrfs: fix hash overflow handling The handling for directory crc hash overflows was fairly obscure, split_leaf returns EOVERFLOW when we try to extend the item and that is supposed to bubble up to userland. For a while it did so, but along the way we added better handling of errors and forced the FS readonly if we hit IO errors during the directory insertion. Along the way, we started testing only for EEXIST and the EOVERFLOW case was dropped. The end result is that we may force the FS readonly if we catch a directory hash bucket overflow. This fixes a few problem spots. First I add tests for EOVERFLOW in the places where we can safely just return the error up the chain. btrfs_rename is harder though, because it tries to insert the new directory item only after it has already unlinked anything the rename was going to overwrite. Rather than adding very complex logic, I added a helper to test for the hash overflow case early while it is still safe to bail out. Snapshot and subvolume creation had a similar problem, so they are using the new helper now too. Signed-off-by: Chris Mason Reported-by: Pascal Junod --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/dir-item.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 24 +++++++++++++++++++- fs/btrfs/ioctl.c | 10 +++++++++ fs/btrfs/transaction.c | 2 +- 5 files changed, 95 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 44d9bc87e86..547b7b05727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3283,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d0696..502c2158167 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ + if (ret > 0) { + ret = 0; + goto out; + } + + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + /* * lookup a directory item based on index. 'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2e6918c85b7..e95b1f90a1f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4885,7 +4885,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - if (ret == -EEXIST) + if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -7336,6 +7336,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + + + /* check for collisions, even if the name isn't there */ + ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, + new_dentry->d_name.name, + new_dentry->d_name.len); + + if (ret) { + if (ret == -EEXIST) { + /* we shouldn't get + * eexist without a new_inode */ + if (!new_inode) { + WARN_ON(1); + return ret; + } + } else { + /* maybe -EOVERFLOW */ + return ret; + } + } + ret = 0; + /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 657d83ca9de..d4608ab72b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -710,6 +710,16 @@ static noinline int btrfs_mksubvol(struct path *parent, if (error) goto out_dput; + /* + * even if this name doesn't exist, we may get hash collisions. + * check for them now when we can safely fail + */ + error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, + dir->i_ino, name, + namelen); + if (error) + goto out_dput; + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e6509b92433..87fac9a21ea 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1190,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode, &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST); + BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, root, ret); goto fail; -- cgit v1.2.3 From 213490b301773ea9c6fb89a86424a6901fcdd069 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 11 Sep 2012 08:33:50 -0600 Subject: Btrfs: fix a bug of per-file nocow Users report a bug, the reproducer is: $ mkfs.btrfs /dev/loop0 $ mount /dev/loop0 /mnt/btrfs/ $ mkdir /mnt/btrfs/dir $ chattr +C /mnt/btrfs/dir/ $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=10; $ lsattr /mnt/btrfs/dir/foo ---------------C- /mnt/btrfs/dir/foo $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 1 extent found ---> an extent $ dd if=/dev/zero of=/mnt/btrfs/dir/foo bs=4K count=1 seek=5 conv=notrunc,nocreat; sync $ filefrag /mnt/btrfs/dir/foo /mnt/btrfs/dir/foo: 3 extents found ---> with nocow, btrfs breaks the extent into three parts The new created file should not only inherit the NODATACOW flag, but also honor NODATASUM flag, because we must do COW on a file extent with checksum. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e95b1f90a1f..67ed24ae86b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4818,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW) || - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) + if (btrfs_test_opt(root, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4608ab72b7..7624212ae92 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -141,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; } - if (flags & BTRFS_INODE_NODATACOW) + if (flags & BTRFS_INODE_NODATACOW) { BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } btrfs_update_iflags(inode); } -- cgit v1.2.3 From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a8ee75cb96e..9c6673a9231 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2120,7 +2120,7 @@ out: return ret; } -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em; @@ -2154,7 +2154,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) * before the position we want in case there is outstanding delalloc * going on here. */ - if (origin == SEEK_HOLE && start != 0) { + if (whence == SEEK_HOLE && start != 0) { if (start <= root->sectorsize) em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, root->sectorsize, 0); @@ -2188,13 +2188,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - if (origin == SEEK_HOLE) { + if (whence == SEEK_HOLE) { *offset = start; free_extent_map(em); break; } } else { - if (origin == SEEK_DATA) { + if (whence == SEEK_DATA) { if (em->block_start == EXTENT_MAP_DELALLOC) { if (start >= inode->i_size) { free_extent_map(em); @@ -2231,16 +2231,16 @@ out: return ret; } -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; mutex_lock(&inode->i_mutex); - switch (origin) { + switch (whence) { case SEEK_END: case SEEK_CUR: - offset = generic_file_llseek(file, offset, origin); + offset = generic_file_llseek(file, offset, whence); goto out; case SEEK_DATA: case SEEK_HOLE: @@ -2249,7 +2249,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) return -ENXIO; } - ret = find_desired_extent(inode, &offset, origin); + ret = find_desired_extent(inode, &offset, whence); if (ret) { mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3 From 4c3e696981a565aace08678e70c40709a85f9b2b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2012 15:43:18 -0500 Subject: Revert "Btrfs: MOD_LOG_KEY_REMOVE_WHILE_MOVING never change node's nritems" This reverts commit 95c80bb1f6b24b57058d971ed252b2c1c5121b51. The bug addressed by this commit was fixed differently back in 3.6 Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c7b67cf24bb..569c0dfb526 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1138,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - case MOD_LOG_KEY_REMOVE: - n++; case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); + n++; break; case MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); -- cgit v1.2.3 From 57ba86c00f9573b63b8c06810d4f6915efed2442 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 18 Dec 2012 19:35:32 -0500 Subject: Revert "Btrfs: reorder tree mod log operations in deleting a pointer" This reverts commit 6a7a665d78c5dd8bc76a010648c4e7d84517ab5a. This was bug was fixed differently in 3.6, so this commit isn't needed. Conflicts: fs/btrfs/ctree.c Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 569c0dfb526..eea5da7a2b9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4611,12 +4611,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 nritems; int ret; - if (level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - } - nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) @@ -4627,6 +4621,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); } nritems--; -- cgit v1.2.3 From 39e3c9553f34381a1b664c27b0c696a266a5735e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 28 Nov 2012 11:30:53 -0500 Subject: vfs: remove DCACHE_NEED_LOOKUP The code that relied on that flag was ripped out of btrfs quite some time ago, and never added back. Josef indicated that he was going to take a different approach to the problem in btrfs, and that we could just eliminate this flag. Cc: Josef Bacik Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/btrfs/inode.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86b..16d9e8e191e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4262,16 +4262,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - if (unlikely(d_need_lookup(dentry))) { - memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); - kfree(dentry->d_fsdata); - dentry->d_fsdata = NULL; - /* This thing is hashed, drop it for now */ - d_drop(dentry); - } else { - ret = btrfs_inode_by_name(dir, dentry, &location); - } - + ret = btrfs_inode_by_name(dir, dentry, &location); if (ret < 0) return ERR_PTR(ret); @@ -4341,11 +4332,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct dentry *ret; ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); - if (unlikely(d_need_lookup(dentry))) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); - } return ret; } -- cgit v1.2.3 From cfa7a9ccda711ac6ab8f0d17c3a9b540092d305a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 17 Dec 2012 06:38:51 +0000 Subject: Btrfs: fix memory leak in name_cache_insert() We should free name_cache_entry before returning from the error handling code. Signed-off-by: Tsutomu Itoh --- fs/btrfs/send.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad4..321b7fb4e44 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); -- cgit v1.2.3 From cc975eb4605c5765a5d5e7a51d24ba5a1cda269e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 7 Dec 2012 10:09:19 +0000 Subject: btrfs: get the device in write mode when deleting it When we're deleting the device we should get it in write mode since we're going to re-write the super block magic on that device. And it should fail if the device is read-only. Signed-off-by: Lukas Czerner --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa7401..86279c37de6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) -- cgit v1.2.3 From d86e56cf7d3669dd292012ac82b986bd1573b6cc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 11:35:41 +0000 Subject: Btrfs: disable qgroup id 0 Qgroup id 0 is a special number, we should set the id of a qgroup to 0. Fix it. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7624212ae92..dd8e3448fe8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3698,6 +3698,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); -- cgit v1.2.3 From 5c39da5b6ca23e68e7acea7f4c01470383475214 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Oct 2012 11:39:53 +0000 Subject: Btrfs: do not delete a subvolume which is in a R/O subvolume Step to reproduce: # mkfs.btrfs # mount # btrfs sub create /subv0 # btrfs sub snap /subv0/snap0 # change /subv0 from R/W to R/O # btrfs sub del /subv0/snap0 We deleted the snapshot successfully. I think we should not be able to delete the snapshot since the parent subvolume is R/O. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dd8e3448fe8..5a72896bd76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2095,13 +2095,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; -- cgit v1.2.3 From dba60f3f5d564167118cad151a7d41dfe8d2a5f7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:19:51 +0000 Subject: Btrfs: fix resize a readonly device We should not resize a readonly device, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a72896bd76..0de21213d05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1362,6 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1370,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; -- cgit v1.2.3 From 97547676570b3bd908560741315bf4b7d635bcf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 10:38:50 +0000 Subject: Btrfs: fix missing write access release in btrfs_ioctl_resize() We forget to give up the write access after we find some device operation is going on. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de21213d05..982c0b9ceea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1339,6 +1339,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } -- cgit v1.2.3 From 72bcd99d450cb1dde8bf13c3b65fc5883b2a3893 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 15:16:34 -0500 Subject: Btrfs: set flushing if we're limited flushing We still need to say we're flushing if we're limit flushing to keep somebody from coming in and stealing our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d133edfcd44..61fefda74ff 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } -- cgit v1.2.3 From f3fe820c20a1a36c790545184e734e78d61cd68d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Jan 2013 17:03:21 -0500 Subject: Btrfs: add orphan before truncating pagecache Running xfstests 83 in a loop would sometimes fail the fsck. This happens because if we invalidate a page that already has an ordered extent setup for it we will complete the ordered extent ourselves, assuming that the truncate will clean everything up. The problem with this is there is plenty of time for the truncate to fail after we've done this work. So to fix this we need to add the orphan item first to make sure the cleanup gets done properly, and then we can truncate the pagecache and all that stuff and be safe. This fixes the btrfsck failures I was seeing while running 83 in a loop. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86b..4ddcf79e789 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3783,9 +3795,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -6929,11 +6966,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6944,12 +6979,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7018,12 +7047,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { -- cgit v1.2.3 From ac5c93005b7073732e268606688fb6c821d5310e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:24 +0000 Subject: Btrfs: let allocation start from the right raid type This'd avoid us empty looping. Say we have only one disk and the metadata raid type will be defaultly DUP, and we do not need to start from index=0(RAID10) and get over two empty loops to index=2(DUP). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61fefda74ff..aeba53191ec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5560,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; -- cgit v1.2.3 From 3268a2468eb6a31af89930cbae58a62fe6ca6d2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 28 Dec 2012 09:33:19 +0000 Subject: Btrfs: reset path lock state to zero We forgot to reset the path lock state to zero after we unlock the path block, and this can lead to the ASSERT checker in tree unlock API. Reported-by: Slava Barinov Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aeba53191ec..85b8454d960 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6788,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } -- cgit v1.2.3 From 1214b53f90131fee1f950010c43e92455fe598ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 03:53:08 +0000 Subject: Btrfs: fix off-by-one in lseek Lock end is inclusive. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20452c110d7..fa48051484b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2242,6 +2242,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); -- cgit v1.2.3 From f9e4fb53938de5db01950c9dfe479703b2f5c964 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 10:10:12 +0000 Subject: Btrfs: fix a bug when llseek for delalloc bytes behind prealloc extents xfstests case 285 complains. It it because btrfs did not try to find unwritten delalloc bytes(only dirty pages, not yet writeback) behind prealloc extents, it ends up finding nothing while we're with SEEK_DATA. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/inode.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa48051484b..841cfe3be0e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2309,9 +2309,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ddcf79e789..ac98384b174 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5623,10 +5623,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5708,6 +5711,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; -- cgit v1.2.3 From f276795627045a3c599a60b476767861e4318c7d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Jan 2013 19:37:58 +0000 Subject: btrfs: fix btrfs_cont_expand() freeing IS_ERR em btrfs_cont_expand() tries to free an IS_ERR em as it gets an error from btrfs_get_extent() and breaks out of its loop. An instance of -EEXIST was reported in the wild: https://bugzilla.redhat.com/show_bug.cgi?id=874407 I have no idea if that -EEXIST is surprising, or not. Regardless, this error handling should be cleaned up to handle other reasonable errors (ENOMEM, EIO; whatever). This seemed to be the only buggy freeing of the relatively rare IS_ERR em so I opted to fix the caller rather than teach free_extent_map() to use IS_ERR_OR_NULL(). Signed-off-by: Zach Brown Reviewed-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ac98384b174..3d2c64d4734 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3677,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); -- cgit v1.2.3 From 3972f2603d8570effaf633cea52b12c7c2773c11 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Jan 2013 02:57:22 +0000 Subject: btrfs: update timestamps on truncate() truncate() vs. ftruncate() differ in the VFS; truncate() doesn't set (ATTR_CTIME | ATTR_MTIME), and it's up to the fs to do the timestamp updates if the size changes. Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d2c64d4734..9bc6c40b182 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -3761,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3843,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } -- cgit v1.2.3 From ed0fb78fb6aa294a719f8f5654fdff0ec8bc00bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: bring back balance pause/resume logic Balance pause/resume logic got broken by 5ac00add (went in into 3.8-rc1 as part of dev-replace merge). Offending commit took a stab at making mutually exclusive volume operations (add_dev, rm_dev, resize, balance, replace_dev) not block behind volume_mutex if another such operation is in progress and instead return an error right away. Balancing front-end relied on the blocking behaviour, so the fix is ugly, but short of a complete rework, it's the best we can do. Reported-by: Liu Bo Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 78 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.c | 10 ++++--- 2 files changed, 71 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 982c0b9ceea..77d8273e394 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3440,8 +3440,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3450,14 +3450,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3477,13 +3524,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3504,11 +3548,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3516,12 +3566,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86279c37de6..9c84dbe64f1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2959,6 +2959,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3140,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3160,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3182,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3235,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); -- cgit v1.2.3 From 2c0c9da02a2c4289350da6e54202a86602c0f926 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix "mutually exclusive op is running" error code The error code that is returned in response to starting a mutually exclusive operation when there is one already running got silently changed from EINVAL to EINPROGRESS by 5ac00add. Returning EINPROGRESS to, say, add_dev, when rm_dev is running is misleading. Furthermore, the operation itself may want to use EINPROGRESS for other purposes. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 77d8273e394..259dd52d878 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1340,7 +1340,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2192,7 +2192,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } ret = mnt_want_write_file(file); if (ret) { @@ -2266,7 +2266,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2303,7 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); -- cgit v1.2.3 From 18f39c416d18d74ac11d157e44247253d3fa30ae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_resize Fix unlock order in btrfs_ioctl_resize(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 259dd52d878..fcf1b1b4008 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1446,8 +1446,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From 4ac20c70b0734b65662ded735e5f6ba0415bdb71 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_rm_dev Fix unlock order in btrfs_ioctl_rm_dev(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fcf1b1b4008..f5c1c150d9f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2319,8 +2319,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From 25122d15e21cf252e91e4cad7cea760f97df29f1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: reorder locks and sanity checks in btrfs_ioctl_defrag Operation-specific check (whether subvol is readonly or not) should go after the mutual exclusiveness check. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5c1c150d9f..afbf3ac2079 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2186,19 +2186,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2250,8 +2251,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From ff24858c65d9c518af41aad22fb964685351051a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:08 -0700 Subject: Btrfs: ignore orphan qgroup relations If a qgroup that has still assignments is deleted by the user, the corresponding relations are left in the tree. This leads to an unmountable filesystem. With this patch, those relations are simple ignored. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8..28f2b39f6a2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: -- cgit v1.2.3 From 2cf687039676c2b6e1ee96b0b89090aca94babcd Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:09 -0700 Subject: Btrfs: prevent qgroup destroy when there are still relations Currently you can just destroy a qgroup even though it is in use by other qgroups or has qgroups assigned to it. This patch prevents destruction of qgroups unless they are completely unused. Otherwise destroy will return EBUSY. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 28f2b39f6a2..a5c85623432 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -963,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; -- cgit v1.2.3 From a105bb88f46b60de2adf1ee98745bd59362b09ab Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 Jan 2013 15:15:56 +0200 Subject: Btrfs: fix a regression in balance usage filter Commit 3fed40cc ("Btrfs: cleanup duplicated division functions"), which was merged into 3.8-rc1, has introduced a regression by removing logic that was guarding us against bad user input. Bring it back. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c84dbe64f1..46960983891 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2614,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; -- cgit v1.2.3 From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: Btrfs: do not allow logged extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 13 ++++++++++++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 5 +++-- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fff2c28497b..ed88f5ee4be 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -256,7 +260,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -281,6 +286,12 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e..c6598c89cff 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d..de8899b04d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3410,13 +3410,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3424,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); -- cgit v1.2.3 From b0175117b9376a69978bbe80af26fb95dddbd53e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 11:39:19 -0500 Subject: Btrfs: fix panic when recovering tree log A user reported a BUG_ON(ret) that occured during tree log replay. Ret was -EAGAIN, so what I think happened is that we removed an extent that covered a bitmap entry and an extent entry. We remove the part from the bitmap and return -EAGAIN and then search for the next piece we want to remove, which happens to be an entire extent entry, so we just free the sucker and return. The problem is ret is still set to -EAGAIN so we trip the BUG_ON(). The user used btrfs-zero-log so I'm not 100% sure this is what happened so I've added a WARN_ON() to catch the other possibility. Thanks, Reported-by: Jan Steffens Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c..0be7a8742a4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); -- cgit v1.2.3 From 192000dda22e02225772e862b92e7c09e5a17d08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 6 Jan 2013 03:38:22 +0000 Subject: Btrfs: use right range to find checksum for compressed extents For compressed extents, the range of checksum is covered by disk length, and the disk length is different with ram length, so we need to use disk length instead to get us the right checksum. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de8899b04d6..9027bb1e746 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, -- cgit v1.2.3 From e58dd74bccb4317e39e4b675bf9c6cd133608fac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Jan 2013 15:43:09 -0500 Subject: Btrfs: put csums on the right ordered extent I noticed a WARN_ON going off when adding csums because we were going over the amount of csum bytes that should have been allowed for an ordered extent. This is a leftover from when we used to hold the csums privately for direct io, but now we use the normal ordered sum stuff so we need to make sure and check if we've moved on to another extent so that the csums are added to the right extent. Without this we could end up with csums for bytenrs that don't have extents to cover them yet. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef4235..94aa53b3872 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; -- cgit v1.2.3 From 8d25a086eb104297e3ba1fdd180b04cfaaa84797 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:27:25 +0000 Subject: Btrfs: Add ACCESS_ONCE() to transaction->abort accesses We may access and update transaction->aborted on the different CPUs without lock, so we need ACCESS_ONCE() wrapper to prevent the compiler from creating unsolicited accesses and make sure we can get the right value. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86..d8982e9601d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea..0ef29611fad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1468,7 +1468,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } -- cgit v1.2.3 From 2cba30f172afdfa00f3e844f42f21eb3b972d01c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:29:12 +0000 Subject: Btrfs: fix missed transaction->aborted check First, though the current transaction->aborted check can stop the commit early and avoid unnecessary operations, it is too early, and some transaction handles don't end, those handles may set transaction->aborted after the check. Second, when we commit the transaction, we will wake up some worker threads to flush the space cache and inode cache. Those threads also allocate some transaction handles and may set transaction->aborted if some serious error happens. So we need more check for ->aborted when committing the transaction. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0ef29611fad..f15494699f3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1575,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1658,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; -- cgit v1.2.3 From c9f01bfe0ca411b4751d7fdbb9d602035ba52f75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jan 2013 11:27:17 +0000 Subject: Btrfs: fix wrong max device number for single profile The max device number of single profile is 1, not 0 (0 means 'as many as possible'). Fix it. Cc: Liu Bo Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 46960983891..15f6efdf646 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3507,7 +3507,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 1eafa6c73791e4f312324ddad9cbcaf6a1b6052b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:49:00 +0000 Subject: Btrfs: fix repeated delalloc work allocation btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/ btrfs_queue_worker for this inode, and then it locks the list, checks the head of the list again. But because we don't delete the first inode that it deals with before, it will fetch the same inode. As a result, this function allocates a huge amount of btrfs_delalloc_work structures, and OOM happens. Fix this problem by splice this delalloc list. Reported-by: Alex Lyakas Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9bc6c40b182..ca7ace7b7b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7585,41 +7585,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7632,11 +7652,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } -- cgit v1.2.3 From 3c911608085bf2d5a0822c418129f96a2a89d1b5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 31 Jan 2013 00:55:02 +0000 Subject: btrfs: don't try to notify udev about missing devices If we remove a missing device, bdev is null, and if we send that off to btrfs_kobject_uevent we'll panic. Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa7401..485a5423e3c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); error_brelse: brelse(bh); -- cgit v1.2.3 From 222c81dc3874b4fe98371be665d0447a36447653 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 28 Jan 2013 09:45:20 -0500 Subject: Btrfs: do not merge logged extents if we've removed them from the tree You can run into this problem where if somebody is fsyncing and writing out the existing extents you will have removed the extent map from the em tree, but it's still valid for the current fsync so we go ahead and write it. The problem is we unconditionally try to merge it back into the em tree, but if we've removed it from the em tree that will cause use after free problems. Fix this to only merge if we are still a part of the tree. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ed88f5ee4be..9759911dd34 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -289,7 +289,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - try_merge_map(tree, em); + if (em->in_tree) + try_merge_map(tree, em); } /** -- cgit v1.2.3 From 0a3404dcff29a7589e8d6ce8c36f97c5411f85b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:34:55 +0000 Subject: Btrfs: fix wrong sync_writers decrement in btrfs_file_aio_write() If the checks at the beginning of btrfs_file_aio_write() fail, we needn't decrease ->sync_writers, because we have not increased it. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3be0e..a902faab716 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1595,9 +1595,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; -- cgit v1.2.3 From 843fcf35733164076a77ad833c72c32da8228ad0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:36:22 +0000 Subject: Btrfs: fix missing release of the space/qgroup reservation in start_transaction() When we fail to start a transaction, we need to release the reserved free space and qgroup space, fix it. Signed-off-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f15494699f3..fc03aa60b68 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,12 +333,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -365,11 +367,7 @@ again: if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -410,6 +408,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, -- cgit v1.2.3 From 6f1c36055f96e80031c7fdda3fd5be826b8d7782 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jan 2013 03:22:10 +0000 Subject: Btrfs: fix race between snapshot deletion and getting inode While running snapshot testscript created by Mitch and David, the race between autodefrag and snapshot deletion can lead to corruption of dead_root list so that we can get crash on btrfs_clean_old_snapshots(). And besides autodefrag, scrub also does the same thing, ie. read root first and get inode. Here is the story(take autodefrag as an example): (1) when we delete a snapshot or subvolume, it will set its root's refs to zero and do a iput() on its own inode, and if this inode happens to be the only active in-meory one in root's inode rbtree, it will add itself to the global dead_roots list for later cleanup. (2) after (1), the autodefrag thread may read another inode for defrag and the inode is just in the deleted snapshot/subvolume, but all of these are without checking if the root is still valid(refs > 0). So the end up result is adding the deleted snapshot/subvolume's root to the global dead_roots list AGAIN. Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu. So all we need to do is to take the lock to protect 'read root and get inode', since we synchronize to wait for the rcu grace period before adding something to the global dead_roots list. Reported-by: Mitch Harder Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++++---- fs/btrfs/scrub.c | 25 ++++++++++++++++++++----- 2 files changed, 38 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a902faab716..b06d289f998 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c..67783e03d12 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); -- cgit v1.2.3 From 5d1f40202bad12d4c70a2d40a420b30d23a72b1a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 14:17:31 -0500 Subject: Btrfs: fix missing i_size update If we have an ordered extent before the ordered extent we are currently completing that is after the current disk_i_size we will put our i_size update into that ordered extent so that we do not expose stale data. The problem is that if our disk i_size is updated past the previous ordered extent we won't update the i_size with the pending i_size update. So check the pending i_size update and if its above the current disk i_size we need to go ahead and try to update. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f1073129704..fc6840b53d9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. -- cgit v1.2.3 From 59fe4f41976f6331b695ff049296d082cf621823 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 14:31:31 -0500 Subject: Btrfs: fix possible stale data exposure We specifically do not update the disk i_size if there are ordered extents outstanding for any area between the current disk_i_size and our ordered extent so that we do not expose stale data. The problem is the check we have only checks if the ordered extent starts at or after the current disk_i_size, which doesn't take into account an ordered extent that starts before the current disk_i_size and ends past the disk_i_size. Fix this by checking if the extent ends past the disk_i_size. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fc6840b53d9..e5ed5672960 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -877,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real -- cgit v1.2.3 From eb6b88d92c6df083dd09a8c471011e3788dfd7c6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 27 Jan 2013 23:26:00 -0700 Subject: Btrfs: fix EDQUOT handling in btrfs_delalloc_reserve_metadata When btrfs_qgroup_reserve returned a failure, we were missing a counter operation for BTRFS_I(inode)->outstanding_extents++, leading to warning messages about outstanding extents and space_info->bytes_may_use != 0. Additionally, the error handling code didn't take into account that we dropped the inode lock which might require more cleanup. Luckily, all the cleanup code we need is already there and can be shared with reserve_metadata_bytes, which is exactly what this patch does. Reported-by: Lev Vainblat Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 85b8454d960..61da9d0bb80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) unsigned nr_extents = 0; int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret; + int ret = 0; bool delalloc_lock = true; /* If we are a free space inode we need to not flush since we will be in @@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) { + if (root->fs_info->quota_enabled) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) { - spin_lock(&BTRFS_I(inode)->lock); - calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } - } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + /* + * ret != 0 here means the qgroup reservation failed, we go straight to + * the shared error handling then. + */ + if (ret == 0) + ret = reserve_metadata_bytes(root, block_rsv, + to_reserve, flush); + if (ret) { u64 to_free = 0; unsigned dropped; -- cgit v1.2.3 From 1a65e24b0bb7dde48cac4a2bf74d5558f9e32ba7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 6 Feb 2013 12:06:02 -0500 Subject: Btrfs: move d_instantiate outside the transaction during mksubvol Dave Sterba triggered a lockdep complaint about lock ordering between the sb_internal lock and the cleaner semaphore. btrfs_lookup_dentry() checks for orphans if we're looking up the inode for a subvolume, and subvolume creation is triggering the lookup with a transaction running. This commit moves the d_instantiate after the transaction closes. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index afbf3ac2079..a31cd931d36 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: if (async_transid) { *async_transid = trans->transid; @@ -525,6 +524,10 @@ fail: } if (err && !ret) ret = err; + + if (!ret) + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + return ret; } -- cgit v1.2.3