From a0acae0e886d44bd5ce6d2f173c1ace0fcf0d9f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: unexport refrigerator() and update try_to_freeze() slightly There is no reason to export two functions for entering the refrigerator. Calling refrigerator() instead of try_to_freeze() doesn't save anything noticeable or removes any race condition. * Rename refrigerator() to __refrigerator() and make it return bool indicating whether it scheduled out for freezing. * Update try_to_freeze() to return bool and relay the return value of __refrigerator() if freezing(). * Convert all refrigerator() users to try_to_freeze(). * Update documentation accordingly. * While at it, add might_sleep() to try_to_freeze(). Signed-off-by: Tejun Heo Cc: Samuel Ortiz Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Steven Whitehouse Cc: Andrew Morton Cc: Jan Kara Cc: KONISHI Ryusuke Cc: Christoph Hellwig --- fs/btrfs/async-thread.c | 2 +- fs/btrfs/disk-io.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec14097fef..98ab240072e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -340,7 +340,7 @@ again: if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); - refrigerator(); + try_to_freeze(); } else { spin_unlock_irq(&worker->lock); if (!kthread_should_stop()) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62afe5c5694..622654fe051 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1579,9 +1579,7 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(root->fs_info); } - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1635,9 +1633,7 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && !btrfs_transaction_blocked(root->fs_info)) -- cgit v1.2.3 From 42b2aa86c6670347a2a07e6d7af0e0ecc8fdbff9 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Mon, 28 Nov 2011 20:31:00 -0800 Subject: treewide: Fix typos in various parts of the kernel, and fix some comments. The below patch fixes some typos in various parts of the kernel, as well as fixes some comments. Please let me know if I missed anything, and I will try to get it changed and resent. Signed-off-by: Justin P. Mattock Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 116ab67a06d..c3308c38ae7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1943,7 +1943,7 @@ enum btrfs_orphan_cleanup_state { }; /* - * This is called in transaction commmit time. If there are no orphan + * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv * structure. */ -- cgit v1.2.3 From cb54f2571ff3f5128f18efcf888ce3c051c589d9 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Mon, 21 Nov 2011 08:43:28 -0800 Subject: btrfs: free-space-cache.c: remove extra semicolon. The patch below removes an extra semicolon. Signed-off-by: Justin P. Mattock CC: Chris Mason CC: linux-btrfs@vger.kernel.org Signed-off-by: Jiri Kosina --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f9d2a..75a7b114776 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -418,7 +418,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) } if (index == 0) - offset = sizeof(u32) * io_ctl->num_pages;; + offset = sizeof(u32) * io_ctl->num_pages; crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); -- cgit v1.2.3 From 32c7f202a4801252a0f3578807b75a961f792870 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 8 Aug 2011 15:19:47 -0600 Subject: btrfs: fix dirtied pages accounting on sub-page writes When doing 1KB sequential writes to the same page, balance_dirty_pages_ratelimited_nr() should be called once instead of 4 times, the latter makes the dirtier tasks be throttled much too heavy. Fix it with proper de-accounting on clear_page_dirty_for_io(). CC: Chris Mason Signed-off-by: Wu Fengguang --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..bfb620ead29 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1136,7 +1136,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } -- cgit v1.2.3 From 5db0276014b80484689eb6c1bf7b94af1c7d5b1a Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:04:16 +0100 Subject: Btrfs: add optional integrity check code The two files added in this patch contain all the code that is required to implement the integrity checks. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 3068 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/check-integrity.h | 36 + 2 files changed, 3104 insertions(+) create mode 100644 fs/btrfs/check-integrity.c create mode 100644 fs/btrfs/check-integrity.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..ad0b3ba735b --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3068 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and a FLUSH request to the device where + * these blocks are located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "disk-io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_BLOCK_SIZE PAGE_SIZE + +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num:2; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char *data; + struct buffer_head *bh; /* do not use if set to NULL */ +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + struct btrfs_header *hdr, + int limit_nesting, int force_iodone_flag); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, u8 *mapped_data, + unsigned int len, struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + struct btrfs_header *hdr; + + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + (unsigned long long) + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + hdr = (struct btrfs_header *)tmp_next_block_ctx.data; + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + hdr, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, + sizeof(super_tmp->magic)) || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, device->name, + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + struct btrfs_header *const first_hdr, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(leafhdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + sf->nr, + (unsigned long long) + le64_to_cpu(leafhdr->header.generation), + (unsigned long long) + le64_to_cpu(leafhdr->header.owner)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item *disk_item = leafhdr->items + sf->i; + struct btrfs_disk_key *disk_key = &disk_item->key; + u8 type; + const u32 item_offset = le32_to_cpu(disk_item->offset); + + type = disk_key->type; + + if (BTRFS_ROOT_ITEM_KEY == type) { + const struct btrfs_root_item *const root_item = + (struct btrfs_root_item *) + (sf->block_ctx->data + + offsetof(struct btrfs_leaf, items) + + item_offset); + const u64 next_bytenr = + le64_to_cpu(root_item->bytenr); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + le64_to_cpu(root_item-> + generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(nodehdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + (unsigned long long) + le64_to_cpu(nodehdr->header.generation), + (unsigned long long) + le64_to_cpu(nodehdr->header.owner)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr *disk_key_ptr = + nodehdr->ptrs + sf->i; + const u64 next_bytenr = + le64_to_cpu(disk_key_ptr->blockptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &disk_key_ptr->key, + le64_to_cpu(disk_key_ptr->generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) + goto one_stack_frame_backwards; + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + BTRFSIC_BLOCK_SIZE, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + (unsigned long long)next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block), + (unsigned long long)next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + (unsigned long long)next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item *file_extent_item = + (struct btrfs_file_extent_item *)(block_ctx->data + + offsetof(struct btrfs_leaf, + items) + item_offset); + u64 next_bytenr = + le64_to_cpu(file_extent_item->disk_bytenr) + + le64_to_cpu(file_extent_item->offset); + u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); + u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfsic_block_link *l; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item->type, + (unsigned long long) + le64_to_cpu(file_extent_item->disk_bytenr), + (unsigned long long) + le64_to_cpu(file_extent_item->offset), + (unsigned long long) + le64_to_cpu(file_extent_item->num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || + ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + return 0; + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > BTRFSIC_BLOCK_SIZE) + chunk_len = BTRFSIC_BLOCK_SIZE; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + (unsigned long long)next_bytenr, + chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx.dev->name, + (unsigned long long) + next_block_ctx.dev_bytenr, + mirror_num, + (unsigned long long) + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + bytenr, &length, &multi, mirror_num); + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + + if (0 == ret) + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (NULL != block_ctx->bh) { + brelse(block_ctx->bh); + block_ctx->bh = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + block_ctx->bh = NULL; + if (block_ctx->dev_bytenr & 4095) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + (unsigned long long)block_ctx->dev_bytenr); + return -1; + } + if (block_ctx->len > 4096) { + printk(KERN_INFO + "btrfsic: read_block() with too huge size %d\n", + block_ctx->len); + return -1; + } + + block_ctx->bh = __bread(block_ctx->dev->bdev, + block_ctx->dev_bytenr >> 12, 4096); + if (NULL == block_ctx->bh) + return -1; + block_ctx->data = block_ctx->bh->b_data; + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + h = (struct btrfs_header *)data; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + fail++; + + crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + crc_fail++; + + return fail || crc_fail; +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, + u8 *mapped_data, unsigned int len, + struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + + WARN_ON(len > PAGE_SIZE); + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); + if (NULL != bio_is_patched) + *bio_is_patched = 0; + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_super_block *) + mapped_data)->bytenr); + is_metadata = 1; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr, + mapped_data); + } + if (block->logical_bytenr != bytenr) { + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + (unsigned long long) + block->logical_bytenr); + block->logical_bytenr = bytenr; + } else if (state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } else { + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(block->disk_key.objectid), + block->disk_key.type, + (unsigned long long) + le64_to_cpu(block->disk_key.offset), + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation), + (unsigned long long) + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation)); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + return; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", (unsigned long long)bytenr); + return; + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_data); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + (unsigned long long)dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, + (unsigned long long)dev_bytenr); + if (!state->include_extent_data) + return; /* ignore that written D block */ + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = len; + block_ctx.bh = NULL; + } else { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr, + mapped_data); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, len, &block_ctx, + 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + return; + } + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + return; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long) + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long)dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { + WARN_ON(1); + btrfsic_dump_tree(state); + } + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)l->block_ref_to->generation, + (unsigned long long)l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)block->flush_gen, + (unsigned long long) + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + bytenr, PAGE_SIZE); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + (unsigned long long)bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (!match) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + (unsigned long long)bytenr, dev_state->name, + (unsigned long long)dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + (unsigned long long)bytenr, + block_ctx.dev->name, + (unsigned long long)block_ctx.dev_bytenr, + mirror_num); + } + WARN_ON(1); + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," + " size=%lu, data=%p, bdev=%p)\n", + rw, bh->b_blocknr, + (unsigned long long)dev_bytenr, bh->b_size, + bh->b_data, bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + bh->b_data, bh->b_size, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) { + submit_bio(rw, bio); + return; + } + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + int bio_is_patched; + + dev_bytenr = 512 * bio->bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, bio->bi_sector, + (unsigned long long)dev_bytenr, + bio->bi_bdev); + + for (i = 0; i < bio->bi_vcnt; i++) { + u8 *mapped_data; + + mapped_data = kmap(bio->bi_io_vec[i].bv_page); + if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE) == + (dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "#%u: page=%p, mapped=%p, len=%u," + " offset=%u\n", + i, bio->bi_io_vec[i].bv_page, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio, &bio_is_patched, + NULL, rw); + kunmap(bio->bi_io_vec[i].bv_page); + dev_bytenr += bio->bi_io_vec[i].bv_len; + } + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + + submit_bio(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..8b59175cc50 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif -- cgit v1.2.3 From c975dd469d748ce619c510050d4fb407c2398591 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:06:04 +0100 Subject: Btrfs: add config option to enable btrfs integrity check Added the BTRFS_FS_CHECK_INTEGRITY option to Kconfig. It depends on BTRFS_FS. Signed-off-by: Stefan Behrens --- fs/btrfs/Kconfig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be14..d33f01c08b6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL Linux website . If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N -- cgit v1.2.3 From f11e4d7f533249ddfa110116200c5c3a509f9218 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 1 Nov 2011 17:06:39 +0100 Subject: Btrfs: Makefile changes to optionally include btrfs integrity check If the btrfs integrity check is enabled, the files required to implement the checks are included in the build. Signed-off-by: Stefan Behrens --- fs/btrfs/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..bc5b3556cee 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,3 +11,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o -- cgit v1.2.3 From 21adbd5cbb5344a3fca6bb7ddb2ab6cb03c44546 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 9 Nov 2011 13:44:05 +0100 Subject: Btrfs: integrate integrity check module into btrfs This is the last part of the patch series. It modifies the btrfs code to use the integrity check module if configured to do so with the define BTRFS_FS_CHECK_INTEGRITY. If this define is not set, the only effective change is that code is added that handles the mount option to activate the integrity check. If the mount option is set and the define BTRFS_FS_CHECK_INTEGRITY is not set, that code complains in the log and the mount fails with EINVAL. Add the mount option to activate the usage of the integrity check code. Add invocation of btrfs integrity check code init and cleanup function on mount and umount, respectively. Add hook to call btrfs integrity check code version of submit_bh/submit_bio. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 8 +++++++- fs/btrfs/disk-io.c | 26 ++++++++++++++++++++++++-- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/scrub.c | 5 +++-- fs/btrfs/super.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 7 ++++--- 6 files changed, 79 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..39f6188688e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -971,7 +971,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:20; + unsigned long mount_opt:21; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1155,6 +1155,10 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* filesystem state */ u64 fs_state; @@ -1413,6 +1417,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..f363c6d9c3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" +#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2001,6 +2002,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2356,6 +2360,19 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "btrfs: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2634,7 +2651,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2711,7 +2728,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); return 0; } @@ -3057,6 +3074,11 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f..246669296e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" +#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc..567e148caca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -732,7 +733,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -958,7 +959,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, sbio->bio); + btrfsic_submit_bio(READ, sbio->bio); return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea..22a2015f1d7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -165,7 +165,10 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +203,9 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -398,6 +404,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + printk(KERN_INFO "btrfs: enabling check integrity" + " including extent data\n"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->check_integrity_print_mask = intarg; + printk(KERN_INFO "btrfs:" + " check_integrity_print_mask 0x%x\n", + info->check_integrity_print_mask); + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + printk(KERN_ERR "btrfs: support for check_integrity*" + " not compiled in!\n"); + ret = -EINVAL; + goto out; +#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..821334f6e3a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -32,6 +32,7 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" +#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -246,7 +247,7 @@ loop_lock: sync_pending = 0; } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -3304,7 +3305,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); return 0; } @@ -3399,7 +3400,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; -- cgit v1.2.3 From da5c81356426c476112f2b59fe64bdb1b37f079d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 12:29:12 +0200 Subject: Btrfs: generic data structure to build unique lists ulist is a generic data structures to hold a collection of unique u64 values. The only operations it supports is adding to the list and enumerating it. It is possible to store an auxiliary value along with the key. The implementation is preliminary and can probably be sped up significantly. It is used by btrfs_find_all_roots() quota to translate recursions into iterative loops. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/Makefile | 2 +- fs/btrfs/ulist.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ulist.h | 68 +++++++++++++++++ 3 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 fs/btrfs/ulist.c create mode 100644 fs/btrfs/ulist.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..70798407b9a 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o + reada.o backref.o ulist.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 00000000000..12f5147bd2b --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + */ + +#include +#include +#include "ulist.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * elem = NULL; + * + * while ((elem = ulist_next(ulist, elem)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + ulist->nnodes = 0; + ulist->nodes = ulist->int_nodes; + ulist->nodes_alloced = ULIST_SIZE; +} +EXPORT_SYMBOL(ulist_init); + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +void ulist_fini(struct ulist *ulist) +{ + /* + * The first ULIST_SIZE elements are stored inline in struct ulist. + * Only if more elements are alocated they need to be freed. + */ + if (ulist->nodes_alloced > ULIST_SIZE) + kfree(ulist->nodes); + ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ +} +EXPORT_SYMBOL(ulist_fini); + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} +EXPORT_SYMBOL(ulist_reinit); + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(unsigned long gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} +EXPORT_SYMBOL(ulist_alloc); + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} +EXPORT_SYMBOL(ulist_free); + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask) +{ + int i; + + for (i = 0; i < ulist->nnodes; ++i) { + if (ulist->nodes[i].val == val) + return 0; + } + + if (ulist->nnodes >= ulist->nodes_alloced) { + u64 new_alloced = ulist->nodes_alloced + 128; + struct ulist_node *new_nodes; + void *old = NULL; + + /* + * if nodes_alloced == ULIST_SIZE no memory has been allocated + * yet, so pass NULL to krealloc + */ + if (ulist->nodes_alloced > ULIST_SIZE) + old = ulist->nodes; + + new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, + gfp_mask); + if (!new_nodes) + return -ENOMEM; + + if (!old) + memcpy(new_nodes, ulist->int_nodes, + sizeof(ulist->int_nodes)); + + ulist->nodes = new_nodes; + ulist->nodes_alloced = new_alloced; + } + ulist->nodes[ulist->nnodes].val = val; + ulist->nodes[ulist->nnodes].aux = aux; + ++ulist->nnodes; + + return 1; +} +EXPORT_SYMBOL(ulist_add); + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @prev: previously returned element or %NULL to start iteration + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. The iteration is started with + * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +{ + int next; + + if (ulist->nnodes == 0) + return NULL; + + if (!prev) + return &ulist->nodes[0]; + + next = (prev - ulist->nodes) + 1; + if (next < 0 || next >= ulist->nnodes) + return NULL; + + return &ulist->nodes[next]; +} +EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 00000000000..2e25dec58ec --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + */ + +/* + * number of elements statically allocated inside struct ulist + */ +#define ULIST_SIZE 16 + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + unsigned long aux; /* auxiliary value saved along with the val */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + /* + * number of nodes we already have room for + */ + unsigned long nodes_alloced; + + /* + * pointer to the array storing the elements. The first ULIST_SIZE + * elements are stored inline. In this case the it points to int_nodes. + * After exceeding ULIST_SIZE, dynamic memory is allocated. + */ + struct ulist_node *nodes; + + /* + * inline storage space for the first ULIST_SIZE entries + */ + struct ulist_node int_nodes[ULIST_SIZE]; +}; + +void ulist_init(struct ulist *ulist); +void ulist_fini(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(unsigned long gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); + +#endif -- cgit v1.2.3 From c7d22a3c3cdb73d8a0151e2ccc8cf4a48c48310b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 22 Nov 2011 15:14:33 +0100 Subject: Btrfs: added helper btrfs_next_item() btrfs_next_item() makes the btrfs path point to the next item, crossing leaf boundaries if needed. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50634abef9b..3e4a07b7981 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2482,6 +2482,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_leaf(root, p); + return 0; +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, -- cgit v1.2.3 From 66d7e7f09f77456fe68683247d77721032a00ee5 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 12 Sep 2011 15:26:38 +0200 Subject: Btrfs: mark delayed refs as for cow Add a for_cow parameter to add_delayed_*_ref and pass the appropriate value from every call site. The for_cow parameter will later on be used to determine if a ref will change anything with respect to qgroups. Delayed refs coming from relocation are always counted as for_cow, as they don't change subvol quota. Also pass in the fs_info for later use. btrfs_find_all_roots() will use this as an optimization, as changes that are for_cow will not change anything with respect to which root points to a certain leaf. Thus, we don't need to add the current sequence number to those delayed refs. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 42 ++++++++++---------- fs/btrfs/ctree.h | 17 +++++---- fs/btrfs/delayed-ref.c | 50 ++++++++++++++---------- fs/btrfs/delayed-ref.h | 15 +++++--- fs/btrfs/disk-io.c | 3 +- fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++-------------------- fs/btrfs/file.c | 10 ++--- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 5 ++- fs/btrfs/relocation.c | 18 +++++---- fs/btrfs/transaction.c | 4 +- fs/btrfs/tree-log.c | 2 +- 12 files changed, 155 insertions(+), 114 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdee..0639a555e16 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0); + buf->start, 0, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, root, right, 0, 1, 0); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0); + level, root->node->start, 0, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + &disk_key, level, c->start, 0, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + &disk_key, 0, l->start, 0, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); return 0; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3e4a07b7981..543f60bddb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2277,11 +2277,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size); + u64 hint, u64 empty_size, int for_cow); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref); + u64 parent, int last_ref, int for_cow); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2301,17 +2301,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2323,7 +2323,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 root_objectid, u64 owner, u64 offset, int for_cow); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2492,7 +2492,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 125cf76fcd0..3a0f0ab804f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -390,7 +390,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -468,10 +469,12 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action) + u64 ref_root, int level, int action, + int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -522,11 +525,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -554,6 +558,7 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, full_ref->root = ref_root; ref->type = BTRFS_EXTENT_DATA_REF_KEY; } + full_ref->objectid = owner; full_ref->offset = offset; @@ -580,10 +585,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -610,12 +617,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 0); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, level, action); + ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + for_cow); BUG_ON(ret); spin_unlock(&delayed_refs->lock); return 0; @@ -624,11 +632,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -655,18 +665,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 1); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, owner, offset, action); + ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, for_cow); BUG_ON(ret); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -683,7 +695,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e287e3b0eab..8316bff18d3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -151,16 +151,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 94abc25392f..6f8cd17c9a9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1243,7 +1243,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, + 0, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813c6bb96c9..dc8b9a83459 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } return ret; } @@ -2405,7 +2409,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2590,7 +2595,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + int full_backref, int inc, int for_cow) { u64 bytenr; u64 num_bytes; @@ -2603,7 +2608,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, int); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2640,14 +2645,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset); + key.offset, for_cow); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + parent, ref_root, level - 1, 0, + for_cow); if (ret) goto fail; } @@ -2659,15 +2665,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -4937,16 +4943,17 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref) + u64 parent, int last_ref, int for_cow) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } @@ -4981,12 +4988,12 @@ out: btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -4998,14 +5005,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, for_cow); BUG_ON(ret); } return ret; @@ -5826,9 +5836,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); return ret; } @@ -5998,7 +6009,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + u64 hint, u64 empty_size, int for_cow) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6042,10 +6053,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, for_cow); BUG_ON(ret); } return buf; @@ -6062,6 +6074,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int for_reloc; }; #define DROP_REFERENCE 1 @@ -6200,9 +6213,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6346,7 +6359,7 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); + root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6420,9 +6433,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, 1, + wc->for_reloc); else - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, + wc->for_reloc); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6449,7 +6464,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6533,7 +6548,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. */ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6621,6 +6637,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6705,6 +6722,7 @@ out: * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6749,6 +6767,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f2e92828960..d2b60ed6c33 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,7 +678,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset); + start - extent_offset, 0); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset); + extent_offset, 0); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } if (del_nr == 0) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5ccec23984..ea819386b86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3139,7 +3139,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, 0); BUG_ON(ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 72d461656f6..c48f2e931ea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -358,7 +358,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -2425,7 +2425,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao); + new_key.offset - datao, + 0); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dff29d5e151..8c1aae2c845 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); } if (dirty) @@ -1778,21 +1778,23 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2244,7 +2246,7 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); } if (found) { @@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0); + node->level, 0, 1); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..a2bfedcbcab 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0); + btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1); + btrfs_drop_snapshot(root, NULL, 1, 0); } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f4d81c06d48..fce7b9ec3bf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0); BUG_ON(ret); } else { /* -- cgit v1.2.3 From eebe063b7f916087cd5c61de57b20a3a30894a96 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 14:01:24 +0200 Subject: Btrfs: always save ref_root in delayed refs For consistent backref walking and (later) qgroup calculation the information to which root a delayed ref belongs is useful even for shared refs. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 18 ++++++++---------- fs/btrfs/delayed-ref.h | 12 ++++-------- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3a0f0ab804f..babd37badb4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -495,13 +495,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; full_ref = btrfs_delayed_node_to_tree_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_TREE_BLOCK_REF_KEY; - } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -551,13 +550,12 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->in_tree = 1; full_ref = btrfs_delayed_node_to_data_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_DATA_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_EXTENT_DATA_REF_KEY; - } full_ref->objectid = owner; full_ref->offset = offset; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8316bff18d3..a5fb2bc8373 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -98,19 +98,15 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; u64 objectid; u64 offset; }; -- cgit v1.2.3 From a561be7100cd610bd2e082f3211c1dfb45835817 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Nov 2011 11:57:51 -0500 Subject: switch a bunch of places to mnt_want_write_file() it's both faster (in case when file has been opened for write) and cleaner. Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..20dd8f3b6c7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -201,7 +201,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out_unlock; @@ -1855,7 +1855,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; @@ -1987,7 +1987,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2195,7 +2195,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2549,7 +2549,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (btrfs_root_readonly(root)) goto out; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out; -- cgit v1.2.3 From e407699ef56ed948739dd57a5578ba8cb5bd81b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 20:14:54 -0500 Subject: btrfs, nfs, apparmor: don't pull mnt_namespace.h for no reason... it's not needed anymore; we used to, back when we had to do mount_subtree() by hand, complete with put_mnt_ns() in it. No more... Apparmor didn't need it since the __d_path() fix. Signed-off-by: Al Viro --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 200f63bc667..dc62d3cc68f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include "compat.h" #include "delayed-inode.h" -- cgit v1.2.3 From 2a79f17e4a641a2f463cb512cb0ec349844a147b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 9 Dec 2011 08:06:57 -0500 Subject: vfs: mnt_drop_write_file() new helper (wrapper around mnt_drop_write()) to be used in pair with mnt_want_write_file(). Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 20dd8f3b6c7..5441ff1480f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -259,7 +259,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_end_transaction(trans, root); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); ret = 0; out_unlock: @@ -1971,7 +1971,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: kfree(vol_args); return err; @@ -2040,7 +2040,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2510,7 +2510,7 @@ out_unlock: out_fput: fput(src_file); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2565,7 +2565,7 @@ static long btrfs_ioctl_trans_start(struct file *file) out_drop: atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: return ret; } @@ -2800,7 +2800,7 @@ long btrfs_ioctl_trans_end(struct file *file) atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return 0; } -- cgit v1.2.3 From 6b520e0565422966cdf1c3759bd73df77b0f248c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Dec 2011 15:51:45 -0500 Subject: vfs: fix the stupidity with i_dentry in inode destructors Seeing that just about every destructor got that INIT_LIST_HEAD() copied into it, there is no point whatsoever keeping this INIT_LIST_HEAD in inode_init_once(); the cost of taking it into inode_init_always() will be negligible for pipes and sockets and negative for everything else. Not to mention the removal of boilerplate code from ->destroy_inode() instances... Signed-off-by: Al Viro --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fd1a06df5bc..f8ff9738558 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6761,7 +6761,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) static void btrfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -- cgit v1.2.3 From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:41:39 -0400 Subject: switch vfs_mkdir() and ->mkdir() to umode_t vfs_mkdir() gets int, but immediately drops everything that might not fit into umode_t and that's the only caller of ->mkdir()... Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f8ff9738558..e30de56e6b6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4792,7 +4792,7 @@ fail: return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; -- cgit v1.2.3 From 4acdaf27ebe2034c342f3be57ef49aed1ad885ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:42:34 -0400 Subject: switch ->create() to umode_t vfs_create() ignores everything outside of 16bit subset of its mode argument; switching it to umode_t is obviously equivalent and it's the only caller of the method Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e30de56e6b6..19630aacb32 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4665,7 +4665,7 @@ out_unlock: } static int btrfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; -- cgit v1.2.3 From 1a67aafb5f72a436ca044293309fa7e6351d6a35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:52:52 -0400 Subject: switch ->mknod() to umode_t Signed-off-by: Al Viro --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 19630aacb32..0060875d6af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4596,7 +4596,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, } static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; -- cgit v1.2.3 From 175a4eb7ea531cdbf6d574f5d5ba9aa0f5e8ed13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 03:30:54 -0400 Subject: fs: propagate umode_t, misc bits Signed-off-by: Al Viro --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0060875d6af..2f426a51e60 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4412,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, int mode, - u64 *index) + u64 ref_objectid, u64 objectid, + umode_t mode, u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; -- cgit v1.2.3 From 5b25f70f4200766355cdabda604e131d2fb6010d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 10:55:48 +0200 Subject: Btrfs: add nested locking mode for paths This patch adds the possibilty to read-lock an extent even if it is already write-locked from the same thread. btrfs_find_all_roots() needs this capability. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.c | 1 + fs/btrfs/extent_io.h | 2 ++ fs/btrfs/locking.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be1bf627a14..dd8d140eb27 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3571,6 +3571,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7604c300132..bc6a042cb6f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -129,6 +129,7 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; + pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -137,6 +138,7 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; + int lock_nested; /* protects write locks */ rwlock_t lock; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d77b67c4b27..5e178d8f716 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (&eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; return 1; } @@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -181,6 +229,7 @@ again: WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; return 0; } -- cgit v1.2.3 From 00f04b88791ff49dc64ada18819d40a5b0671709 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 14 Sep 2011 12:37:00 +0200 Subject: Btrfs: add sequence numbers to delayed refs Sequence numbers are needed to reconstruct the backrefs of a given extent to a certain point in time. The total set of backrefs consist of the set of backrefs recorded on disk plus the enqueued delayed refs for it that existed at that moment. This patch also adds a list that records all delayed refs which are currently in the process of being added. When walking all refs of an extent in btrfs_find_all_roots(), we freeze the current state of delayed refs, honor anythinh up to this point and prevent processing newer delayed refs to assert consistency. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 34 ++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 4 +++ 3 files changed, 108 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index babd37badb4..a405db0320e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + /* merging of sequenced refs is not allowed */ + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -209,6 +214,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + + assert_spin_locked(&delayed_refs->lock); + if (list_empty(&delayed_refs->seq_head)) + return 0; + + elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", + seq, elem->seq, delayed_refs); + return 1; + } + return 0; +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -438,6 +461,7 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; + ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -479,6 +503,7 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -494,6 +519,10 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_tree_ref(ref); full_ref->parent = parent; full_ref->root = ref_root; @@ -534,6 +563,7 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -549,6 +579,10 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_data_ref(ref); full_ref->parent = parent; full_ref->root = ref_root; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a5fb2bc8373..174416f7882 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; + /* seq number to keep track of insertion order */ + u64 seq; + /* ref count on this data structure */ atomic_t refs; @@ -136,6 +139,20 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * seq number of delayed refs. We need to know if a backref was being + * added before the currently processed ref or afterwards. + */ + u64 seq; + + /* + * seq_list holds a list of all seq numbers that are currently being + * added to the list. While walking backrefs (btrfs_find_all_roots, + * qgroups), which might take some time, no newer ref must be processed, + * as it might influence the outcome of the walk. + */ + struct list_head seq_head; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -171,6 +188,59 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); + +struct seq_list { + struct list_head list; + u64 seq; +}; + +static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) +{ + assert_spin_locked(&delayed_refs->lock); + ++delayed_refs->seq; + return delayed_refs->seq; +} + +static inline void +btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + assert_spin_locked(&delayed_refs->lock); + elem->seq = delayed_refs->seq; + list_add_tail(&elem->list, &delayed_refs->seq_head); +} + +static inline void +btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + spin_lock(&delayed_refs->lock); + list_del(&elem->list); + spin_unlock(&delayed_refs->lock); +} + +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a2bfedcbcab..31a7393af64 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); + WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -108,8 +110,10 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.seq = 1; spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); -- cgit v1.2.3 From d1270cd91f308c9d22b2804720c36ccd32dbc35e Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Tue, 13 Sep 2011 15:16:43 +0200 Subject: Btrfs: put back delayed refs that are too new When processing a delayed ref, first check if there are still old refs in the process of being added. If so, put this ref back to the tree. To avoid looping on this ref, choose a newer one in the next loop. btrfs_find_ref_cluster has to take care of that. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 43 +++++++++++++++++++++++++------------------ fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a405db0320e..ee181989d44 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -155,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last) + struct btrfs_delayed_ref_node **last, + int return_bigger) { - struct rb_node *n = root->rb_node; + struct rb_node *n; struct btrfs_delayed_ref_node *entry; - int cmp; + int cmp = 0; +again: + n = root->rb_node; + entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -187,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } + if (entry && return_bigger) { + if (cmp > 0) { + n = rb_next(&entry->rb_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + bytenr = entry->bytenr; + return_bigger = 0; + goto again; + } + return entry; + } return NULL; } @@ -246,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start, &ref); + find_ref_head(&delayed_refs->root, start + 1, &ref, 1); if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -748,7 +755,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc8b9a83459..bbcca12fbbb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2236,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } } + /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + /* + * there are still refs with lower seq numbers in the + * process of being added. Don't run this ref yet. + */ + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + locked_ref = NULL; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + /* * record the must insert reserved flag before we * drop the spin lock. @@ -2246,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, -- cgit v1.2.3 From a168650c08300434e1456abe7b6451f1448230d3 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 12 Dec 2011 16:10:07 +0100 Subject: Btrfs: add waitqueue instead of doing busy waiting for more delayed refs Now that we may be holding back delayed refs for a limited period, we might end up having no runnable delayed refs. Without this commit, we'd do busy waiting in that thread until another (runnable) ref arives. Instead, we're detecting this situation and use a waitqueue, such that we only try to run more refs after a) another runnable ref was added or b) delayed refs are no longer held back Signed-off-by: Jan Schmidt --- fs/btrfs/delayed-ref.c | 8 +++++++ fs/btrfs/delayed-ref.h | 7 ++++++ fs/btrfs/extent-tree.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/transaction.c | 1 + 4 files changed, 74 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ee181989d44..66e4f29505a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -664,6 +664,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, level, action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -712,6 +715,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, num_bytes, parent, ref_root, owner, offset, action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -739,6 +745,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, extent_op->is_data); BUG_ON(ret); + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 174416f7882..d8f244d9492 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -153,6 +153,12 @@ struct btrfs_delayed_ref_root { * as it might influence the outcome of the walk. */ struct list_head seq_head; + + /* + * when the only refs we have in the list must not be processed, we want + * to wait for more refs to show up or for the end of backref walking. + */ + wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -216,6 +222,7 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, { spin_lock(&delayed_refs->lock); list_del(&elem->list); + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bbcca12fbbb..0a435e2e143 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2300,7 +2300,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + /* + * we modified num_entries, but as we're currently running + * delayed refs, skip + * wake_up(&delayed_refs->seq_wait); + * here. + */ spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2317,6 +2322,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, return count; } + +static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, + unsigned long num_refs) +{ + struct list_head *first_seq = delayed_refs->seq_head.next; + + spin_unlock(&delayed_refs->lock); + pr_debug("waiting for more refs (num %ld, first %p)\n", + num_refs, first_seq); + wait_event(delayed_refs->seq_wait, + num_refs != delayed_refs->num_entries || + delayed_refs->seq_head.next != first_seq); + pr_debug("done waiting for more refs (num %ld, first %p)\n", + delayed_refs->num_entries, delayed_refs->seq_head.next); + spin_lock(&delayed_refs->lock); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2332,8 +2354,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; + u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; + unsigned long num_refs = 0; + int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2341,6 +2366,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: + consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2357,11 +2383,35 @@ again: * of refs to process starting at the first one we are able to * lock */ + delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; + if (delayed_start >= delayed_refs->run_delayed_start) { + if (consider_waiting == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked) and if the number of + * refs doesn't change, we avoid busy waiting. + */ + consider_waiting = 1; + num_refs = delayed_refs->num_entries; + } else { + wait_for_more_refs(delayed_refs, num_refs); + /* + * after waiting, things have changed. we + * dropped the lock and someone else might have + * run some refs, built new clusters and so on. + * therefore, we restart staleness detection. + */ + consider_waiting = 0; + } + } + ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2369,6 +2419,11 @@ again: if (count == 0) break; + + if (ret || delayed_refs->run_delayed_start == 0) { + /* refs were run, let's reset staleness detection */ + consider_waiting = 0; + } } if (run_all) { @@ -4933,6 +4988,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 31a7393af64..04c5c7c2c32 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -111,6 +111,7 @@ loop: cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; cur_trans->delayed_refs.seq = 1; + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); -- cgit v1.2.3 From 8da6d5815c592b713ecaf4f4f8b631f8359c96c4 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 23 Nov 2011 18:55:04 +0100 Subject: Btrfs: added btrfs_find_all_roots() This function gets a byte number (a data extent), collects all the leafs pointing to it and walks up the trees to find all fs roots pointing to those leafs. It also returns the list of all leafs pointing to that extent. It does proper locking for the involved trees, can be used on busy file systems and honors delayed refs. Signed-off-by: Arne Jansen Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 783 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/backref.h | 5 + 2 files changed, 788 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd..03c30a1836f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,6 +19,9 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" struct __data_ref { struct list_head list; @@ -32,6 +35,786 @@ struct __shared_ref { u64 disk_byte; }; +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { + struct list_head list; + u64 root_id; + struct btrfs_key key; + int level; + int count; + u64 parent; + u64 wanted_disk_byte; +}; + +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count) +{ + struct __prelim_ref *ref; + + /* in case we're adding delayed refs, we're holding the refs spinlock */ + ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key = *key; + else + memset(&ref->key, 0, sizeof(ref->key)); + + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, + struct extent_buffer *eb, int level, + u64 wanted_objectid, u64 wanted_disk_byte) +{ + int ret; + int slot; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_byte; + +add_parent: + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + + if (level != 0) + return 0; + + /* + * if the current leaf is full with EXTENT_DATA items, we must + * check the next one if that holds a reference as well. + * ref->count cannot be used to skip this check. + * repeat this until we don't find any additional EXTENT_DATA items. + */ + while (1) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret) + return 0; + + eb = path->nodes[0]; + for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.objectid != wanted_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + fi = btrfs_item_ptr(eb, slot, + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == wanted_disk_byte) + goto add_parent; + } + } + + return 0; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct __prelim_ref *ref, + struct ulist *parents) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_key root_key; + struct btrfs_key key = {0}; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + rcu_read_lock(); + root_level = btrfs_header_level(root->node); + rcu_read_unlock(); + + if (root_level + 1 == level) + goto out; + + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + (unsigned long long)ref->root_id, level, ref->count, ret, + (unsigned long long)ref->key.objectid, ref->key.type, + (unsigned long long)ref->key.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + if (!eb) { + WARN_ON(1); + ret = 1; + goto out; + } + + if (level == 0) { + if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + } + + /* the last two parameters will only be used for level == 0 */ + ret = add_all_parents(root, path, parents, eb, level, key.objectid, + ref->wanted_disk_byte); +out: + btrfs_free_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, ref, parents); + if (err) { + if (ret == 0) + ret = err; + continue; + } + + /* we put the first parent into the ref at hand */ + node = ulist_next(parents, NULL); + ref->parent = node ? node->val : 0; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, node))) { + new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + break; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } + + ulist_free(parents); + return ret; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * mode = 2: merge identical parents + */ +static int __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + if (mode == 1 && ref1->key.type == 0) + continue; + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (memcmp(&ref1->key, &ref2->key, + sizeof(ref1->key)) || + ref1->level != ref2->level || + ref1->root_id != ref2->root_id) + continue; + ref1->count += ref2->count; + } else { + if (ref1->parent != ref2->parent) + continue; + ref1->count += ref2->count; + } + list_del(&ref2->list); + kfree(ref2); + } + + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct btrfs_key *info_key, + struct list_head *prefs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + int sgn; + int ret; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(info_key, &extent_op->key); + + while ((n = rb_prev(n))) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + if (node->bytenr != head->node.bytenr) + break; + WARN_ON(node->is_head); + + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return 0; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int *info_level, + struct list_head *prefs) +{ + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + struct btrfs_disk_key disk_key; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + btrfs_tree_block_key(leaf, info, &disk_key); + btrfs_disk_key_to_cpu(info_key, &disk_key); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + *info_level + 1, offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, info_key, + *info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, + count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int info_level, + struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + info_level + 1, key.offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, info_key, + info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 seq, struct ulist *refs, struct ulist *roots) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_key info_key = { 0 }; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head = NULL; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + /* + * look if there are updates for this ref queued and lock the head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); + if (ret) + goto out; + } + spin_unlock(&delayed_refs->lock); + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_key, &info_level, &prefs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + /* + * when adding the delayed refs above, the info_key might not have + * been known yet. Go over the list and replace the missing keys + */ + list_for_each_entry(ref, &prefs_delayed, list) { + if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) + memcpy(&ref->key, &info_key, sizeof(ref->key)); + } + list_splice_init(&prefs_delayed, &prefs); + + ret = __merge_refs(&prefs, 1); + if (ret) + goto out; + + ret = __resolve_indirect_refs(fs_info, &prefs); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 2); + if (ret) + goto out; + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + if (ref->count < 0) + WARN_ON(1); + if (ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + if (ref->count && ref->parent) { + ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + kfree(ref); + } + +out: + if (head) + mutex_unlock(&head->mutex); + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kfree(ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kfree(ref); + } + + return ret; +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **leafs) +{ + struct ulist *tmp; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) { + ulist_free(tmp); + return -ENOMEM; + } + + ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ulist_free(tmp); + + if (ret < 0 && ret != -ENOENT) { + ulist_free(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, seq, + tmp, *roots); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, node); + if (!node) + break; + bytenr = node->val; + } + + ulist_free(tmp); + return 0; +} + + static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8..d00dfa9ca93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -20,6 +20,7 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" +#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots); + struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); -- cgit v1.2.3 From 4692cf58aa7b81f721c1653d48db99ea41421d58 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 2 Dec 2011 14:56:41 +0100 Subject: Btrfs: new backref walking code The old backref iteration code could only safely be used on commit roots. Besides this limitation, it had bugs in finding the roots for these references. This commit replaces large parts of it by btrfs_find_all_roots() which a) really finds all roots and the correct roots, b) works correctly under heavy file system load, c) considers delayed refs. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 354 +++++++++++++++-------------------------------------- fs/btrfs/ioctl.c | 8 +- fs/btrfs/scrub.c | 7 +- 3 files changed, 107 insertions(+), 262 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 03c30a1836f..b9a843226de 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -23,18 +23,6 @@ #include "transaction.h" #include "delayed-ref.h" -struct __data_ref { - struct list_head list; - u64 inum; - u64 root; - u64 extent_data_item_offset; -}; - -struct __shared_ref { - struct list_head list; - u64 disk_byte; -}; - /* * this structure records all encountered refs on the way up to the root */ @@ -964,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) + found_key->objectid + found_key->offset <= logical) { + pr_debug("logical %llu is not within any extent\n", + (unsigned long long)logical); return -ENOENT; + } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -974,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + (unsigned long long)logical, + (unsigned long long)(logical - found_key->objectid), + (unsigned long long)found_key->objectid, + (unsigned long long)found_key->offset, + (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -1070,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int __data_list_add(struct list_head *head, u64 inum, - u64 extent_data_item_offset, u64 root) -{ - struct __data_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->inum = inum; - ref->extent_data_item_offset = extent_data_item_offset; - ref->root = root; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, - struct btrfs_extent_data_ref *dref) -{ - return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), - btrfs_extent_data_ref_offset(eb, dref), - btrfs_extent_data_ref_root(eb, dref)); -} - -static int __shared_list_add(struct list_head *head, u64 disk_byte) -{ - struct __shared_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->disk_byte = disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, - u64 logical, u64 inum, - u64 extent_data_item_offset, - u64 extent_offset, - struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) -{ - u64 ref_root; - u32 item_size; - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *eiref; - struct __data_ref *ref; - int ret; - int type; - int last; - unsigned long ptr = 0; - - WARN_ON(!list_empty(data_refs)); - ret = extent_from_logical(fs_info, logical, path, &key); - if (ret & BTRFS_EXTENT_FLAG_DATA) - ret = -EIO; - if (ret < 0) - goto out; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - ret = 0; - ref_root = 0; - /* - * as done in iterate_extent_inodes, we first build a list of refs to - * iterate, then free the path and then iterate them to avoid deadlocks. - */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last < 0) { - ret = last; - goto out; - } - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) { - ref_root = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __data_list_add(data_refs, inum, - extent_data_item_offset, - ref_root); - } - } while (!ret && !last); - - btrfs_release_path(path); - - if (ref_root == 0) { - printk(KERN_ERR "btrfs: failed to find tree block ref " - "for shared data backref %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - -out: - while (!list_empty(data_refs)) { - ref = list_first_entry(data_refs, struct __data_ref, list); - list_del(&ref->list); - if (!ret) - ret = iterate(ref->inum, extent_offset + - ref->extent_data_item_offset, - ref->root, ctx); - kfree(ref); - } - - return ret; -} - -static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, - u64 logical, u64 orig_extent_item_objectid, - u64 extent_offset, struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 logical, + u64 orig_extent_item_objectid, + u64 extent_item_pos, u64 root, + iterate_extent_inodes_t *iterate, void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -1199,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret; - int found = 0; + int ret = 0; + int extent_type; + u64 data_offset; + u64 data_len; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -1218,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - if (!fi) { - free_extent_buffer(eb); - return -EIO; - } + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) { - if (found) - break; - else - continue; - } - ++found; - ret = __iter_shared_inline_ref_inodes(fs_info, logical, - key.objectid, - key.offset, - extent_offset, path, - data_refs, - iterate, ctx); - if (ret) - break; - } + if (disk_byte != orig_extent_item_objectid) + continue; - if (!found) { - printk(KERN_ERR "btrfs: failed to follow shared data backref " - "to parent %llu\n", logical); - WARN_ON(1); - ret = -EIO; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + continue; + + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", orig_extent_item_objectid, + key.objectid, key.offset, root); + ret = iterate(key.objectid, + key.offset + (extent_item_pos - data_offset), + root, ctx); + if (ret) { + pr_debug("stopping iteration because ret=%d\n", ret); + break; + } } free_extent_buffer(eb); + return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. will use the path given as a parameter and return it - * released. + * the given parameters. * when the iterator function returns a non-zero value, iteration stops. + * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, - u64 extent_offset, + u64 extent_item_objectid, u64 extent_item_pos, iterate_extent_inodes_t *iterate, void *ctx) { - unsigned long ptr = 0; - int last; int ret; - int type; - u64 logical; - u32 item_size; - struct btrfs_extent_inline_ref *eiref; - struct btrfs_extent_data_ref *dref; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct __data_ref *ref_d; - struct __shared_ref *ref_s; + struct btrfs_trans_handle *trans; + struct ulist *refs; + struct ulist *roots; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list seq_elem; + struct btrfs_delayed_ref_root *delayed_refs; - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - /* first we iterate the inline refs, ... */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last == -ENOENT) { - ret = 0; - break; - } - if (last < 0) { - ret = last; - break; - } + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&eiref->offset); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - logical = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __shared_list_add(&shared_refs, logical); - } - } while (!ret && !last); + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); - /* ... then we proceed to in-tree references and ... */ - while (!ret) { - ++path->slots[0]; - if (path->slots[0] > btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret) { - if (ret == 1) - ret = 0; /* we're done */ - break; - } - eb = path->nodes[0]; - } - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_item_objectid) - break; - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __shared_list_add(&shared_refs, key.offset); - } - } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); - btrfs_release_path(path); + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + extent_item_pos, seq_elem.seq, + &refs); - /* - * ... only at the very end we can process the refs we found. this is - * because the iterator function we call is allowed to make tree lookups - * and we have to avoid deadlocks. additionally, we need more tree - * lookups ourselves for shared data refs. - */ - while (!list_empty(&data_refs)) { - ref_d = list_first_entry(&data_refs, struct __data_ref, list); - list_del(&ref_d->list); - if (!ret) - ret = iterate(ref_d->inum, extent_offset + - ref_d->extent_data_item_offset, - ref_d->root, ctx); - kfree(ref_d); - } + if (ret) + goto out; - while (!list_empty(&shared_refs)) { - ref_s = list_first_entry(&shared_refs, struct __shared_ref, - list); - list_del(&ref_s->list); - if (!ret) - ret = __iter_shared_inline_ref(fs_info, - ref_s->disk_byte, - extent_item_objectid, - extent_offset, path, - &data_refs, - iterate, ctx); - kfree(ref_s); + while (!ret && (ref_node = ulist_next(refs, ref_node))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, + seq_elem.seq, &roots); + if (ret) + break; + while (!ret && (root_node = ulist_next(roots, root_node))) { + pr_debug("root %llu references leaf %llu\n", + root_node->val, ref_node->val); + ret = iterate_leaf_refs(fs_info, path, ref_node->val, + extent_item_objectid, + extent_item_pos, root_node->val, + iterate, ctx); + } } + ulist_free(refs); + ulist_free(roots); +out: + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); return ret; } @@ -1369,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 offset; + u64 extent_item_pos; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - offset = logical - found_key.objectid; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - offset, iterate, ctx); + extent_item_pos, iterate, ctx); return ret; } @@ -1426,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, + (unsigned long long)found_key.objectid, + (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -1466,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { + pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { + pr_debug("missed path, not enough space. missing bytes: %lu, " + "constructed so far: %s\n", + (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c48f2e931ea..9b0526872b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2976,7 +2976,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_offset; + u64 extent_item_pos; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3007,15 +3007,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_offset = loi->logical - key.objectid; + extent_item_pos = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_offset, build_ino_list, inodes); + extent_item_pos, build_ino_list, + inodes); if (ret < 0) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c27bcb67f33..b5edff25a53 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -309,7 +309,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_offset; + u64 extent_item_pos; path = btrfs_alloc_path(); @@ -329,12 +329,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_offset = swarn.logical - found_key.objectid; + extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -351,7 +352,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_offset, + extent_item_pos, scrub_print_warning_inode, &swarn); } -- cgit v1.2.3 From 6bf7e080d5bcb0d399ee38ce3dabbfad64448192 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 1 Dec 2011 14:35:19 +0100 Subject: Btrfs: make sure we're not using obsolete code in btrfs_get_extent There's code in btrfs_get_extent that should never be used. This patch turns a WARN_ON(1) into a BUG(), hoping we can remove the transaction code from btrfs_get_extent soon. Signed-off-by: Jan Schmidt --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ea819386b86..603d740f0f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5022,7 +5022,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { - WARN_ON(1); + BUG(); if (!trans) { kunmap(page); free_extent_map(em); -- cgit v1.2.3 From 203bf287cb01a5dc26c20bd3737cecf3aeba1d48 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:23:57 -0500 Subject: Btrfs: run chunk allocations while we do delayed refs Btrfs tries to batch extent allocation tree changes to improve performance and reduce metadata trashing. But it doesn't allocate new metadata chunks while it is doing allocations for the extent allocation tree. This commit changes the delayed refence code to do chunk allocations if we're getting low on room. It prevents crashes and improves performance. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 14 ++++++++++---- fs/btrfs/transaction.c | 9 +-------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5fbe576d2b..71549d11a09 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2267,9 +2267,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2289,7 +2287,11 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -2317,6 +2319,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..360c2dfd1ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -467,19 +467,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; -- cgit v1.2.3 From cf1d72c9ceec391d34c48724da57282e97f01122 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:41:34 -0500 Subject: Btrfs: lower the bar for chunk allocation The chunk allocation code has tried to keep a pretty tight lid on creating new metadata chunks. This is partially because in the past the reservation code didn't give us an accurate idea of how much space was being used. The new code is much more accurate, so we're able to get rid of some of these checks. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71549d11a09..247d2c94f8e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3263,27 +3263,12 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } -- cgit v1.2.3 From 1100373f8aa69e377386499350496e3d8565605f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:47:38 -0500 Subject: Btrfs: use bigger metadata chunks on bigger filesystems The 256MB chunk is a little small on a huge FS. This scales up the chunk size. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..ac00e3aa80a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2441,7 +2441,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 8 * 1024 * 1024; -- cgit v1.2.3 From a5f6f719a5cd7caeee8ed8137cf3f94c3bbebc65 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 12 Dec 2011 04:48:19 -0200 Subject: Btrfs: test free space only for unclustered allocation Since the clustered allocation may be taking extents from a different block group, there's no point in spin-locking and testing the current block group free space before attempting to allocate space from a cluster, even more so when we might refrain from even trying the cluster in the current block group because, after the cluster was set up, not enough free space remained. Furthermore, cluster creation attempts fail fast when the block group doesn't have enough free space, so the test was completely superfluous. I've move the free space test past the cluster allocation attempt, where it is more useful, and arranged for a cluster in the current block group to be released before trying an unclustered allocation, when we reach the LOOP_NO_EMPTY_SIZE stage, so that the free space in the cluster stands a chance of being combined with additional free space in the block group so as to succeed in the allocation attempt. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 247d2c94f8e..5ea3acc5324 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5286,15 +5286,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5340,8 +5331,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5352,6 +5350,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5392,6 +5395,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* -- cgit v1.2.3 From 34c80b1d93e6e20ca9dea0baf583a5b5510d92d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 21:32:45 -0500 Subject: vfs: switch ->show_options() to struct dentry * Signed-off-by: Al Viro --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc62d3cc68f..ae488aa1966 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -661,9 +661,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_root *root = btrfs_sb(dentry->d_sb); struct btrfs_fs_info *info = root->fs_info; char *compress_type; -- cgit v1.2.3 From fc7c1077ceb99c35e5f9d0ce03dc7740565bb2bf Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 28 Nov 2011 12:36:17 -0200 Subject: Btrfs: don't set up allocation result twice We store the allocation start and length twice in ins, once right after the other, but with intervening calls that may prevent the duplicate from being optimized out by the compiler. Remove one of the assignments. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5ea3acc5324..37594e4bf66 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5441,9 +5441,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); -- cgit v1.2.3 From 1bb91902dc90e25449893e693ad45605cb08fbe5 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Fri, 14 Oct 2011 12:10:36 -0300 Subject: Btrfs: revamp clustered allocation logic Parameterize clusters on minimum total size, minimum chunk size and minimum contiguous size for at least one chunk, without limits on cluster, window or gap sizes. Don't tolerate any fragmentation for SSD_SPREAD; accept it for metadata, but try to keep data dense. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 112 +++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 63 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..ce40db59c70 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2283,23 +2283,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2308,7 +2308,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2318,10 +2318,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2329,13 +2328,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2351,23 +2345,23 @@ again: /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2377,8 +2371,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2391,12 +2385,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,26 +2396,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2438,7 +2421,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); @@ -2460,7 +2443,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2485,7 +2468,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (entry->bytes < min_bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2499,7 +2482,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2515,23 +2498,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2539,7 +2523,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2553,10 +2537,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, } ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) -- cgit v1.2.3 From 98c7089c769048f941bd5c5285287f8fc301f8b1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:00:31 -0500 Subject: btrfs: preparation to fixing mount/umount race We need fs_info and root to live until the moment when the victim superblock leaves the list, so we need to postpone free_fs_info() until after ->put_super(). The call is buried in close_ctree(), though, so we need to lift it into the callers (including btrfs_put_super()) first. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/super.c | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a774..dcb5d949b54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2424,6 +2424,7 @@ retry_root_backup: up_read(&fs_info->cleanup_work_sem); if (err) { close_ctree(tree_root); + free_fs_info(fs_info); return ERR_PTR(err); } } @@ -3059,8 +3060,6 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - free_fs_info(fs_info); - return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae488aa1966..a3f435e5898 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -151,6 +151,7 @@ static void btrfs_put_super(struct super_block *sb) int ret; ret = close_ctree(root); + free_fs_info(root->fs_info); sb->s_fs_info = NULL; (void)ret; /* FIXME: need to fix VFS to return error? */ @@ -589,6 +590,7 @@ static int btrfs_fill_super(struct super_block *sb, struct inode *inode; struct dentry *root_dentry; struct btrfs_root *tree_root; + struct btrfs_fs_info *fs_info; struct btrfs_key key; int err; @@ -609,12 +611,13 @@ static int btrfs_fill_super(struct super_block *sb, printk("btrfs: open_ctree failed\n"); return PTR_ERR(tree_root); } + fs_info = tree_root->fs_info; sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -635,6 +638,7 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(tree_root); + free_fs_info(fs_info); return err; } -- cgit v1.2.3 From aea52e19dd2085617dd7d247afb76a90cf2ea40c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:22:46 -0500 Subject: btrfs: get ->kill_sb() of its own ... and move free_fs_info() to that, out of ->put_super(). Do NOT set ->s_fs_info to NULL in the latter; we need it for sget() to be able to see and wait for fs in the middle of umount if we get a mount/umount race. Signed-off-by: Al Viro --- fs/btrfs/super.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a3f435e5898..eca48624a4f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -147,14 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - int ret; - - ret = close_ctree(root); - free_fs_info(root->fs_info); - sb->s_fs_info = NULL; - - (void)ret; /* FIXME: need to fix VFS to return error? */ + (void)close_ctree(btrfs_sb(sb)); + /* FIXME: need to fix VFS to return error? */ + /* AV: return it _where_? ->put_super() can be triggered by any number + * of async events, up to and including delivery of SIGKILL to the + * last process that kept it busy. Or segfault in the aforementioned + * process... Whom would you report that to? + */ } enum { @@ -1223,11 +1222,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = NULL; + if (sb->s_root) + fs_info = btrfs_sb(sb)->fs_info; + kill_anon_super(sb); + if (fs_info) + free_fs_info(fs_info); +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = kill_anon_super, + .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; -- cgit v1.2.3 From 6de1d09d9677dce7a04ef485d426821159ab7de9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:29:09 -0500 Subject: btrfs: fix mount/umount race Do *NOT* skip doomed superblocks in btrfs_test_super(); we want sget() to wait for their shutdown to complete. Since we don't mutilate ->s_fs_info in ->put_super() anymore (or free what it used to point to until the superblock is past being findable by sget()), we can just DTRT there and report a match. Signed-off-by: Al Viro --- fs/btrfs/super.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index eca48624a4f..b9fd62a0fca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -733,20 +733,15 @@ static int btrfs_test_super(struct super_block *s, void *data) struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); - /* - * If this super block is going away, return false as it - * can't match as an existing super block. - */ - if (!atomic_read(&s->s_active)) - return 0; return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - s->s_fs_info = data; - - return set_anon_super(s, data); + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; } /* -- cgit v1.2.3 From 10f6327b5d26660e06120ba17cff993cb616b1d2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:05:22 -0500 Subject: btrfs: fix a deadlock in btrfs_scan_one_device() pathname resolution under a global mutex, taken on some paths in ->mount() is a Bad Idea(tm) - think what happens if said pathname resolution triggers automount of some btrfs instance and walks into attempt to grab the same mutex. Deadlock - we are waiting for daemon to finish walking the path, daemon is waiting for us to release the mutex... Signed-off-by: Al Viro --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..96bb3c0a79b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -706,8 +706,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; - mutex_lock(&uuid_mutex); - flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,6 +714,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } + mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -737,9 +736,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: + mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: - mutex_unlock(&uuid_mutex); return ret; } -- cgit v1.2.3 From 2eb34cd368b3e5cef960ae9d9971a037cd6fa9d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:32:06 -0500 Subject: btrfs: sanitizing ->fs_info, part 1 take assignment of ->fs_info to callers of __setup_root() Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dcb5d949b54..04eba10af82 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1142,7 +1142,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; - root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1193,6 +1192,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, u32 blocksize; u64 generation; + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, objectid); @@ -1227,6 +1227,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1330,6 +1331,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, goto out; } + root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, location->objectid); @@ -2055,6 +2057,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + tree_root->fs_info = fs_info; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2247,6 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); + chunk_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); @@ -2373,6 +2377,7 @@ retry_root_backup: goto fail_trans_kthread; } + log_tree_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); -- cgit v1.2.3 From 1233f546ec29ac32424327baf6ae5df81e3d9eae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:34:00 -0500 Subject: btrfs: sanitizing ->fs_info, part 2 lift assignment to callers of find_and_setup_root() Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04eba10af82..b8dae517f80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1192,7 +1192,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, u32 blocksize; u64 generation; - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, objectid); @@ -1322,6 +1321,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { + root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); if (ret) { @@ -2300,18 +2300,21 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); + extent_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) goto recovery_tree_root; extent_root->track_dirty = 1; + dev_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) goto recovery_tree_root; dev_root->track_dirty = 1; + csum_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) -- cgit v1.2.3 From 38a77db49ad8f78369dcdfb693b8e5a818a60104 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:38:54 -0500 Subject: btrfs: sanitizing ->fs_info, part 3 move assignments to ->fs_info in open_ctree() up, to the place just after the original allocations. Assignment for tree_root becomes a no-op - we'd obtained fs_info from tree_root->fs_info in the first place. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b8dae517f80..06480fcf0fb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1320,8 +1320,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); + root->fs_info = fs_info; if (location->offset == (u64)-1) { - root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); if (ret) { @@ -1331,7 +1331,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, goto out; } - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, location->objectid); @@ -1914,6 +1913,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } + chunk_root->fs_info = fs_info; + extent_root->fs_info = fs_info; + dev_root->fs_info = fs_info; + csum_root->fs_info = fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -2057,7 +2060,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); - tree_root->fs_info = fs_info; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2250,7 +2252,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); - chunk_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); @@ -2300,21 +2301,18 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); - extent_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) goto recovery_tree_root; extent_root->track_dirty = 1; - dev_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) goto recovery_tree_root; dev_root->track_dirty = 1; - csum_root->fs_info = fs_info; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) -- cgit v1.2.3 From 6f07e42ee6fcc252a210781d7262f4051e9fd8f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:46:16 -0500 Subject: btrfs: sanitizing ->fs_info, part 4 A new helper: btrfs_alloc_root(fs_info); allocates btrfs_root and sets ->fs_info. All places allocating the suckers converted to it. At that point we *never* reassign ->fs_info of btrfs_root; it's set before anyone sees the address of newly allocated struct btrfs_root and never assigned anywhere else. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 33 +++++++++++++++------------------ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/super.c | 3 +-- 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 06480fcf0fb..ee846ce5884 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1215,6 +1215,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } +struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1222,11 +1230,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); - root->fs_info = fs_info; __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1317,10 +1324,9 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); - root->fs_info = fs_info; if (location->offset == (u64)-1) { ret = find_and_setup_root(tree_root, fs_info, location->objectid, root); @@ -1900,23 +1906,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - extent_root = fs_info->extent_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - csum_root = fs_info->csum_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - chunk_root = fs_info->chunk_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - dev_root = fs_info->dev_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); + csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - chunk_root->fs_info = fs_info; - extent_root->fs_info = fs_info; - dev_root->fs_info = fs_info; - csum_root->fs_info = fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -2372,13 +2370,12 @@ retry_root_backup: btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; } - log_tree_root->fs_info = fs_info; __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c99d0a8f13f..2bb5f59ddf9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,6 +86,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); +struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info); + #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b9fd62a0fca..e9f876a1655 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -901,12 +901,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + fs_info->tree_root = btrfs_alloc_root(fs_info); if (!fs_info->tree_root) { error = -ENOMEM; goto error_fs_info; } - fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); -- cgit v1.2.3 From e3029d9fd426c8f582210ba35551ae5506218345 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 00:56:18 -0500 Subject: btrfs: sanitizing ->fs_info, part 5 close_ctree() uses a weird mix of accesses to root->fs_info and its value at the beginning of function stored in local variable. Since ->fs_info *never* changes, let's just use the local variable to avoid confusion. Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ee846ce5884..9308c7f13c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2982,7 +2982,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(root->fs_info); + btrfs_run_defrag_inodes(fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3011,8 +3011,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3030,14 +3030,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(root->fs_info->dev_root->commit_root); - free_extent_buffer(root->fs_info->csum_root->node); - free_extent_buffer(root->fs_info->csum_root->commit_root); - - btrfs_free_block_groups(root->fs_info); + free_extent_buffer(fs_info->chunk_root->node); + free_extent_buffer(fs_info->chunk_root->commit_root); + free_extent_buffer(fs_info->dev_root->node); + free_extent_buffer(fs_info->dev_root->commit_root); + free_extent_buffer(fs_info->csum_root->node); + free_extent_buffer(fs_info->csum_root->commit_root); + + btrfs_free_block_groups(fs_info); del_fs_roots(fs_info); -- cgit v1.2.3 From ad2b2c802be2d3e8ed8364fef5ffaddabe448219 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:10:02 -0500 Subject: btrfs: make open_ctree() return int It returns either ERR_PTR(-ve) or sb->s_fs_info. The latter can be found by caller just as well, TYVM, no need to return it. Just return -ve or 0... Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 12 ++++++------ fs/btrfs/disk-io.h | 6 +++--- fs/btrfs/super.c | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9308c7f13c1..78247522c69 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1880,9 +1880,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -2428,11 +2428,11 @@ retry_root_backup: if (err) { close_ctree(tree_root); free_fs_info(fs_info); - return ERR_PTR(err); + return err; } } - return tree_root; + return 0; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2479,7 +2479,7 @@ fail_srcu: fail: btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); - return ERR_PTR(err); + return err; recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2bb5f59ddf9..6f5f48778b0 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e9f876a1655..56e007fd670 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -604,12 +604,12 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - tree_root = open_ctree(sb, fs_devices, (char *)data); - - if (IS_ERR(tree_root)) { + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { printk("btrfs: open_ctree failed\n"); - return PTR_ERR(tree_root); + return err; } + tree_root = sb->s_fs_info; fs_info = tree_root->fs_info; sb->s_fs_info = tree_root; -- cgit v1.2.3 From 29db78aa0ac82319b764b87a1c5030d74523e296 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:12:38 -0500 Subject: btrfs: kill pointless reassignment of ->s_fs_info in btrfs_fill_super() We do not (fortunately) modify ->s_fs_info of superblock on the fly in btrfs_fill_super(); apparent assignment is a no-op. Signed-off-by: Al Viro --- fs/btrfs/super.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 56e007fd670..a381f9f9b0c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -588,8 +588,8 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root; - struct btrfs_fs_info *fs_info; + struct btrfs_root *tree_root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_key key; int err; @@ -609,9 +609,6 @@ static int btrfs_fill_super(struct super_block *sb, printk("btrfs: open_ctree failed\n"); return err; } - tree_root = sb->s_fs_info; - fs_info = tree_root->fs_info; - sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; -- cgit v1.2.3 From be7e0950def403e90b5295ff2192c39967bf2aec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:40:05 -0500 Subject: btrfs: merge free_fs_info() calls on fill_super failures ... all the way up into btrfs_mount(). Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/super.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 78247522c69..b6d11eb4e43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2427,7 +2427,6 @@ retry_root_backup: up_read(&fs_info->cleanup_work_sem); if (err) { close_ctree(tree_root); - free_fs_info(fs_info); return err; } } @@ -2478,7 +2477,6 @@ fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - free_fs_info(fs_info); return err; recovery_tree_root: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a381f9f9b0c..8901b6c8526 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -634,7 +634,6 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(tree_root); - free_fs_info(fs_info); return err; } @@ -947,6 +946,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { + free_fs_info(fs_info); deactivate_locked_super(s); return ERR_PTR(error); } -- cgit v1.2.3 From d22ca7de770e2a683eac000ba4db5a95e2f4c969 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:46:50 -0500 Subject: btrfs: make free_fs_info() call ->kill_sb() unconditional ... and don't bother with it after btrfs_fill_super() failure - ->kill_sb() (unlike ->put_super()) will be called even if we have not got non-NULL ->s_root. Signed-off-by: Al Viro --- fs/btrfs/super.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8901b6c8526..f628a6a79ca 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -946,7 +946,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { - free_fs_info(fs_info); deactivate_locked_super(s); return ERR_PTR(error); } @@ -1215,12 +1214,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void btrfs_kill_super(struct super_block *sb) { - struct btrfs_fs_info *fs_info = NULL; - if (sb->s_root) - fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; kill_anon_super(sb); - if (fs_info) - free_fs_info(fs_info); + free_fs_info(fs_info); } static struct file_system_type btrfs_fs_type = { -- cgit v1.2.3 From 59553edf110e5576d91be9dd5bd53d110e0d0290 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 01:56:28 -0500 Subject: btrfs: consolidate failure exits in btrfs_mount() a bit Signed-off-by: Al Viro --- fs/btrfs/super.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f628a6a79ca..15c8ae571e4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -630,6 +630,7 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; return 0; fail_close: @@ -929,14 +930,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { - deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; - } - btrfs_close_devices(fs_devices); free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; } else { char b[BDEVNAME_SIZE]; @@ -945,19 +942,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) deactivate_locked_super(s); - return root; - } return root; -- cgit v1.2.3 From 815745cf3e46681241ad8025602ffbf2a452d514 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:40:49 -0500 Subject: btrfs: let ->s_fs_info point to fs_info, not root... the latter can be obtained from the former (by looking as ->tree_root) just as cheaply as we currently are doing the other way round. Signed-off-by: Al Viro --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 +-- fs/btrfs/export.c | 2 +- fs/btrfs/ioctl.c | 7 +++-- fs/btrfs/super.c | 75 +++++++++++++++++++++++++++--------------------------- 5 files changed, 45 insertions(+), 45 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..dbb999a3827 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2196,7 +2196,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6d11eb4e43..f7d8b9ba519 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1894,8 +1894,8 @@ int open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1b8dc33778f..5f77166fd01 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5441ff1480f..0c55f8f3809 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -276,14 +276,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -312,7 +311,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(root, &range); + ret = btrfs_trim_fs(fs_info->tree_root, &range); if (ret < 0) return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15c8ae571e4..305adf6dc09 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -147,7 +147,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)); + (void)close_ctree(btrfs_sb(sb)->tree_root); /* FIXME: need to fix VFS to return error? */ /* AV: return it _where_? ->put_super() can be triggered by any number * of async events, up to and including delivery of SIGKILL to the @@ -500,7 +500,8 @@ out: static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_root *root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -530,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -544,7 +545,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = root->fs_info->fs_root; + new_root = fs_info->fs_root; goto setup_root; } @@ -552,7 +553,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -588,8 +589,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root = sb->s_fs_info; - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -634,20 +634,21 @@ static int btrfs_fill_super(struct super_block *sb, return 0; fail_close: - close_ctree(tree_root); + close_ctree(fs_info->tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); + filemap_flush(fs_info->btree_inode->i_mapping); return 0; } @@ -663,8 +664,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_fs_info *info = root->fs_info; + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -727,10 +728,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_root *test_root = data; - struct btrfs_root *root = btrfs_sb(s); + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); - return root->fs_info->fs_devices == test_root->fs_info->fs_devices; + return fs_info->fs_devices == p->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) @@ -922,8 +923,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, - fs_info->tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -939,7 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->fs_info->bdev_holder = fs_type; + btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } @@ -959,7 +959,8 @@ error_fs_info: static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; ret = btrfs_parse_options(root, data); @@ -975,13 +976,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (root->fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(root->fs_info); + ret = btrfs_cleanup_fs_roots(fs_info); WARN_ON(ret); /* recover relocation */ @@ -1150,18 +1151,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = root->fs_info->super_copy; - struct list_head *head = &root->fs_info->space_info; + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)root->fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1180,14 +1181,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(root, &total_free_data); + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1203,7 +1204,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void btrfs_kill_super(struct super_block *sb) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); kill_anon_super(sb); free_fs_info(fs_info); } @@ -1246,17 +1247,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->cleaner_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_lock(&fs_info->transaction_kthread_mutex); + mutex_lock(&fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_unlock(&root->fs_info->cleaner_mutex); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_unlock(&fs_info->cleaner_mutex); + mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } -- cgit v1.2.3 From f84a8bd60e3ee49eacc9ba824babf149ba3dad7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:57:57 -0500 Subject: btrfs: take allocation of ->tree_root into open_ctree() now that we don't need it for sget() anymore... Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 8 +++++--- fs/btrfs/disk-io.h | 2 -- fs/btrfs/super.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f7d8b9ba519..dd71be87593 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1215,7 +1215,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); if (root) @@ -1895,7 +1895,7 @@ int open_ctree(struct super_block *sb, struct buffer_head *bh; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1906,12 +1906,14 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - if (!extent_root || !csum_root || !chunk_root || !dev_root) { + if (!tree_root || !extent_root || !csum_root || + !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 6f5f48778b0..e4bc4741319 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,8 +86,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 305adf6dc09..044652ee7ef 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -899,11 +899,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = btrfs_alloc_root(fs_info); - if (!fs_info->tree_root) { - error = -ENOMEM; - goto error_fs_info; - } fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); -- cgit v1.2.3 From e3a41a5ba9c2ab988b9f1442925109dca2382fd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:55 -0800 Subject: btrfs: pass __GFP_WRITE for buffered write page allocations Tell the page allocator that pages allocated for a buffered write are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Minchan Kim Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..20375e6691c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; -- cgit v1.2.3 From db804f23a72bada58f083dfad6a65d019ddb3bd4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 10 Jan 2012 16:41:01 +0800 Subject: Btrfs: add pinned extents to on-disk free space cache correctly I got this while running xfstests: [24256.836098] block group 317849600 has an wrong amount of free space [24256.836100] btrfs: failed to load free space cache for block group 317849600 We should clamp the extent returned by find_first_extent_bit(), so the start of the extent won't smaller than the start of the block group. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..01840ef95a3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -838,7 +838,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, end, len; + u64 start, extent_start, extent_end, len; int entries = 0; int bitmaps = 0; int ret; @@ -857,25 +857,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); - /* - * When searching for pinned extents, we need to start at our start - * offset. - */ - if (block_group) - start = block_group->key.objectid; - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -918,9 +905,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + if (block_group) + start = block_group->key.objectid; + while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -928,20 +926,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + + if (extent_start >= block_group->key.objectid + block_group->key.offset) break; - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; entries++; - ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; - start = end + 1; + start = extent_end; } /* Write out the bitmaps */ -- cgit v1.2.3 From a1ee5a45818acc7f9c13e560827cf3e8735ac919 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:27:42 +0800 Subject: Btrfs: avoid possible NULL deref in io_ctl_drop_pages() If we run into some failure path in io_ctl_prepare_pages(), io_ctl->pages[] array may have some NULL pointers. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01840ef95a3..4e55af333e1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } } } -- cgit v1.2.3 From 706efc6630c2722602541a6a2fc5900a4e38456a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Jan 2012 14:36:28 +0800 Subject: Btrfs: check the return value of io_ctl_init() It can return -ENOMEM. Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4e55af333e1..e4eb222147c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -637,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return ret; + ret = readahead_cache(inode); if (ret) goto out; @@ -851,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) -- cgit v1.2.3 From f062abf089ff705e09bbaa6fa1e2fd7688a0f2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:36:45 +0800 Subject: Btrfs: remove BUG_ON()s in btrfs_ioctl_setflags() We can recover from errors and return -errno to user space. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..9619fb03a5d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -250,18 +255,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } mnt_drop_write(file->f_path.mnt); - - ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3 From 4da6f1a332f6c16b6594c7892f13c31459b9b1c8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 13:39:50 +0800 Subject: Btrfs: reserve metadata space in btrfs_ioctl_setflags() Check and reserve space for btrfs_update_inode(). Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9619fb03a5d..fe8a60c865e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -254,7 +254,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root); + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_drop; -- cgit v1.2.3 From 125ccb0ae6806dbec31abf4a85448971df3b4e39 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Dec 2011 15:07:24 +0800 Subject: Btrfs: don't pass a trans handle unnecessarily in volumes.c Some functions never use the transaction handle passed to them. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 18 +++++++----------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3df..5b53479ce07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7084,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..73f673c8d8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -829,7 +829,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +847,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +891,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1469,8 +1467,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1695,7 +1692,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -2323,8 +2320,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2496,7 +2492,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +2683,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..c1701ec9d49 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -230,7 +230,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif -- cgit v1.2.3 From de11cc12df17337979e0929d2831887432f236ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 12:55:47 +0800 Subject: Btrfs: don't pre-allocate btrfs bio We pre-allocate a btrfs bio with fixed size, and then may re-allocate memory if we find stripes are bigger than the fixed size. But this pre-allocation is not necessary. Also we don't have to calcuate the stripe number twice. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 67 +++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f673c8d8d..540fdd25fb5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2897,26 +2897,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2935,32 +2922,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3055,6 +3016,13 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = @@ -3151,15 +3119,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, -- cgit v1.2.3 From ec9ef7a13be4dcce964c8503e8999087945e5b9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 14:06:42 +0800 Subject: Btrfs: simplfy calculation of stripe length for discard operation For btrfs raid, while discarding a range of space, we'll need to know the start offset and length to discard for each device, and it's done in btrfs_map_block(). However the calculation is a bit complex for raid0 and raid10, so I reimplement it based on a fact that: dev1 dev2 dev3 (raid0) ----------------------------------- s0 s3 s6 s1 s4 s7 s2 s5 Each device has (total_stripes / nr_dev) stripes, or plus one. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 95 ++++++++++++++++++------------------------------------ 1 file changed, 31 insertions(+), 64 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 540fdd25fb5..563ef650850 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3024,80 +3024,47 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, atomic_set(&bbio->error, 0); if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) - bbio->stripes[i].length -= - stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { - bbio->stripes[i].length -= - stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; -- cgit v1.2.3 From 7fe1e641502616220437079258506196bc4d8cbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 29 Dec 2011 14:47:27 +0800 Subject: Btrfs: rewrite btrfs_trim_block_group() There are various bugs in block group trimming: - It may trim from offset smaller than user-specified offset. - It may trim beyond user-specified range. - It may leak free space for extents smaller than specified minlen. - It may truncate the last trimmed extent thus leak free space. - With mixed extents+bitmaps, some extents may not be trimmed. - With mixed extents+bitmaps, some bitmaps may not be trimmed (even none will be trimmed). Even for those trimmed, not all the free space in the bitmaps will be trimmed. I rewrite btrfs_trim_block_group() and break it into two functions. One is to trim extents only, and the other is to trim bitmaps only. Before patching: # fstrim -v /mnt/ /mnt/: 1496465408 bytes were trimmed After patching: # fstrim -v /mnt/ /mnt/: 2193768448 bytes were trimmed And this matches the total free space: # btrfs fi df /mnt Data: total=3.58GB, used=1.79GB System, DUP: total=8.00MB, used=4.00KB System: total=4.00MB, used=0.00 Metadata, DUP: total=205.12MB, used=97.14MB Metadata: total=8.00MB, used=0.00 Signed-off-by: Li Zefan --- fs/btrfs/free-space-cache.c | 235 +++++++++++++++++++++++++++++++------------- 1 file changed, 164 insertions(+), 71 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4eb222147c..b3cbb8939fa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2594,17 +2594,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; + struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 bytes = 0; - u64 actually_trimmed; - int ret = 0; + int ret; + int update = 0; + u64 trimmed = 0; - *trimmed = 0; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2615,81 +2655,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) - entry = tree_search_offset(ctl, - offset_to_bitmap(ctl, start), - 1, 1); - - if (!entry || entry->offset >= end) { + if (!entry) { spin_unlock(&ctl->tree_lock); break; } - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &start, &bytes); - if (!ret) { - if (start >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - bytes = min(bytes, end - start); - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } else { - start = entry->offset + BITS_PER_BITMAP * - block_group->sectorsize; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { spin_unlock(&ctl->tree_lock); - ret = 0; - continue; + goto out; } - } else { - start = entry->offset; - bytes = min(entry->bytes, end - start); - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; + } + + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; } + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + spin_unlock(&ctl->tree_lock); - if (bytes >= minlen) { - struct btrfs_space_info *space_info; - int update = 0; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += bytes; - space_info->bytes_reserved += bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, - bytes, - &actually_trimmed); - - btrfs_add_free_space(block_group, start, bytes); - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += bytes; - block_group->reserved -= bytes; - space_info->bytes_reserved -= bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; - if (ret) - break; - *trimmed += actually_trimmed; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } - start += bytes; - bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2702,6 +2779,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, return ret; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. -- cgit v1.2.3 From c7c144db531fda414e532adac56e965ce332e2a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 10:39:22 +0800 Subject: Btrfs: update global block_rsv when creating a new block group A bug was triggered while using seed device: # mkfs.btrfs /dev/loop1 # btrfstune -S 1 /dev/loop1 # mount -o /dev/loop1 /mnt # btrfs dev add /dev/loop2 /mnt btrfs: block rsv returned -28 ------------[ cut here ]------------ WARNING: at fs/btrfs/extent-tree.c:5969 btrfs_alloc_free_block+0x166/0x396 [btrfs]() ... Call Trace: ... [] btrfs_cow_block+0x101/0x147 [btrfs] [] btrfs_search_slot+0x1b8/0x55f [btrfs] [] btrfs_insert_empty_items+0x42/0x7f [btrfs] [] btrfs_insert_item+0x40/0x7e [btrfs] [] btrfs_make_block_group+0x243/0x2aa [btrfs] [] __btrfs_alloc_chunk+0x672/0x70e [btrfs] [] init_first_rw_device+0x77/0x13c [btrfs] [] btrfs_init_new_device+0x664/0x9fd [btrfs] [] btrfs_ioctl+0x694/0xdbe [btrfs] [] do_vfs_ioctl+0x496/0x4cc [] sys_ioctl+0x33/0x4f [] sysenter_do_call+0x12/0x38 ---[ end trace 906adac595facc7d ]--- Since seed device is readonly, there's no usable space in the filesystem. Afterwards we add a sprout device to it, and the kernel creates a METADATA block group and a SYSTEM block group where comes free space we can reserve, but we still get revervation failure because the global block_rsv hasn't been updated accordingly. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b53479ce07..bf30f670cda 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7446,6 +7446,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; -- cgit v1.2.3 From b367e47fb3a70f5d24ebd6faf7d42436d485fb2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 11:38:24 +0800 Subject: Btrfs: fix possible deadlock when opening a seed device The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex, but when we mount a filesystem which has backing seed devices, we have this lock chain: open_ctree() lock(chunk_mutex); read_chunk_tree(); read_one_dev(); open_seed_devices(); lock(uuid_mutex); and then we hit a lockdep splat. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/volumes.c | 9 +++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..858ab347413 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2270,9 +2270,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 563ef650850..fbb493b28d5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3506,7 +3506,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3544,7 +3544,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3687,6 +3686,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3737,6 +3739,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From b969c4ab9f182a6e1b2a0848be349f99714947b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:34 -0800 Subject: mm: compaction: determine if dirty pages can be migrated without blocking within ->migratepage Asynchronous compaction is used when allocating transparent hugepages to avoid blocking for long periods of time. Due to reports of stalling, there was a debate on disabling synchronous compaction but this severely impacted allocation success rates. Part of the reason was that many dirty pages are skipped in asynchronous compaction by the following check; if (PageDirty(page) && !sync && mapping->a_ops->migratepage != migrate_page) rc = -EBUSY; This skips over all mapping aops using buffer_migrate_page() even though it is possible to migrate some of these pages without blocking. This patch updates the ->migratepage callback with a "sync" parameter. It is the responsibility of the callback to fail gracefully if migration would block. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a774..1375494c8cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { /* * we can't safely write a btree page from here, @@ -887,7 +887,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } #endif -- cgit v1.2.3 From a6bc32b899223a877f595ef9ddc1e89ead5072b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:43 -0800 Subject: mm: compaction: introduce sync-light migration for use by compaction This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT mode that avoids writing back pages to backing storage. Async compaction maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT. For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is used. This avoids sync compaction stalling for an excessive length of time, particularly when copying files to a USB stick where there might be a large number of dirty pages backed by a filesystem that does not support ->writepages. [aarcange@redhat.com: This patch is heavily based on Andrea's work] [akpm@linux-foundation.org: fix fs/nfs/write.c build] [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1375494c8cb..d8525662ca7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } #endif -- cgit v1.2.3 From 6fef8df1dcb9b586268caff66df1d71ce8610132 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: get rid of *_alloc_profile fields {data,metadata,system}_alloc_profile fields have been unused for a long time now. Get rid of them. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/extent-tree.c | 10 ++++------ fs/btrfs/volumes.c | 6 ++---- 4 files changed, 6 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..f5434ad49b9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1135,9 +1135,6 @@ struct btrfs_fs_info { u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; unsigned data_chunk_allocations; unsigned metadata_ratio; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e58..ce9d0fb3627 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2321,9 +2321,6 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3df..f0591fd6624 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3067,14 +3067,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits & - root->fs_info->data_alloc_profile; + flags |= root->fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits & - root->fs_info->system_alloc_profile; + flags |= root->fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits & - root->fs_info->metadata_alloc_profile; + flags |= root->fs_info->avail_metadata_alloc_bits; + return btrfs_reduce_alloc_profile(root, flags); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..89096f6d6fb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2752,8 +2752,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); + fs_info->avail_metadata_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -2763,8 +2762,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); + fs_info->avail_system_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, -- cgit v1.2.3 From 52ba692972532f8d652080214b6599ece3dd51b9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: introduce masks for chunk type and profile Chunk's type and profile are encoded in u64 flags field. Introduce masks to easily access them. Also fix the type of BTRFS_BLOCK_GROUP_* constants, it should be ULL. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 26 +++++++++++++++++--------- fs/btrfs/extent-tree.c | 12 +++--------- fs/btrfs/volumes.c | 11 ++--------- 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f5434ad49b9..4370a56fe81 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -751,15 +751,23 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_NR_RAID_TYPES 5 - +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_NR_RAID_TYPES 5 + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f0591fd6624..a8d8204188d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -2993,9 +2992,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | - BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3016,10 +3013,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; if (extra_flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89096f6d6fb..d5fdee56777 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2949,12 +2949,8 @@ again: } } if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) stripes_required = map->num_stripes; - } } if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { @@ -2978,10 +2974,7 @@ again: if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); -- cgit v1.2.3 From a46d11a8b06dd0431a3888fbc4856ea13a8e634f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: add BTRFS_AVAIL_ALLOC_BIT_SINGLE bit Right now on-disk BTRFS_BLOCK_GROUP_* profile bits are used for avail_{data,metadata,system}_alloc_bits fields, which gather info about available allocation profiles in the FS. When chunk is created or read from disk, its profile is OR'ed with the corresponding avail_alloc_bits field. Since SINGLE is denoted by 0 in the on-disk format, currently there is no way to tell when such chunks become avaialble. Restriper needs that information, so add a separate bit for SINGLE profile. This bit is going to be in-memory only, it should never be written out to disk, so it's not a disk format change. However to avoid remappings in future, reserve corresponding on-disk bit. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 15 +++++++++++++++ fs/btrfs/extent-tree.c | 30 +++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4370a56fe81..3f8f11e18b5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -758,6 +758,7 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE #define BTRFS_NR_RAID_TYPES 5 #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ @@ -768,6 +769,15 @@ struct btrfs_csum_item { BTRFS_BLOCK_GROUP_RAID1 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) + struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; @@ -1140,6 +1150,11 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8d8204188d..15a22949da1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3014,16 +3014,24 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - if (extra_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - } + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; } +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. + */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3053,8 +3061,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) + (flags & BTRFS_BLOCK_GROUP_DUP))) { flags &= ~BTRFS_BLOCK_GROUP_RAID0; + } + + /* extended -> chunk profile */ + flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } -- cgit v1.2.3 From 10ea00f55a07f8f9536d9112b95108a86f700bab Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: make avail_*_alloc_bits fields dynamic Currently when new chunks are created respective avail_alloc_bits field is updated to reflect profiles of all chunks present in the system. However when chunks are removed profile bits are never cleared. This patch clears profile bit of respective avail_alloc_bits field when the last chunk with that profile is removed. Restriper needs this to properly operate when "downgrading". Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 15a22949da1..946b0671c5d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7469,6 +7469,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7479,6 +7495,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; + int index; int factor; root = root->fs_info->extent_root; @@ -7494,6 +7511,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7568,6 +7586,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) + clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) -- cgit v1.2.3 From c9e9f97bdfb64d06e9520f8e4f37674ac21cc9bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: add basic restriper infrastructure Add basic restriper infrastructure: extended balancing ioctl and all related ioctl data structures, add data structure for tracking restriper's state to fs_info, etc. The semantics of the old balancing ioctl are fully preserved. Explicitly disallow any volume operations when balance is in progress. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 6 +++ fs/btrfs/disk-io.c | 4 ++ fs/btrfs/ioctl.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/ioctl.h | 43 +++++++++++++++++ fs/btrfs/volumes.c | 120 +++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.h | 14 +++++- 6 files changed, 281 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f8f11e18b5..c4d98c8df5c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -934,6 +934,7 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1159,6 +1160,11 @@ struct btrfs_fs_info { u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + struct btrfs_balance_control *balance_ctl; + unsigned data_chunk_allocations; unsigned metadata_ratio; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ce9d0fb3627..190a1b24c90 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2002,6 +2002,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + fs_info->balance_ctl = NULL; + sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..d838d2cfb94 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1203,13 +1203,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1226,7 +1234,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_unlock; + goto out_free; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1241,7 +1249,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } @@ -1250,7 +1258,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1259,11 +1267,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); @@ -1276,7 +1284,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_unlock; + goto out_free; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1284,9 +1292,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2052,14 +2061,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2074,14 +2094,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -3034,6 +3065,76 @@ out: return ret; } +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); +} + +static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out; + } + } else { + bargs = NULL; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } + + ret = btrfs_balance(bctl, bargs); + /* + * bctl is freed in __cancel_balance + */ + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3078,7 +3179,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3110,6 +3211,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de..c8b37d2c0d7 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,47 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + __u64 usage; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 target; + + __u64 flags; + + __u64 unused[8]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -272,6 +313,8 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d5fdee56777..9fc06e6bc51 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1282,7 +1282,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1452,7 +1451,6 @@ error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1629,7 +1627,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1757,8 +1754,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1766,7 +1762,7 @@ error: mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2077,6 +2073,35 @@ error: return ret; } +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2086,29 +2111,23 @@ static u64 div_factor(u64 num, int factor) return num; } -int btrfs_balance(struct btrfs_root *dev_root) +static int __btrfs_balance(struct btrfs_fs_info *fs_info) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + int ret; + int enospc_errors = 0; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2151,12 +2170,14 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2174,12 +2195,63 @@ int btrfs_balance(struct btrfs_root *dev_root) found_key.offset); if (ret && ret != -ENOSPC) goto error; + if (ret == -ENOSPC) + enospc_errors++; key.offset = found_key.offset - 1; } - ret = 0; + error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + + return ret; +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + unset_balance_control(fs_info); +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret; + + if (btrfs_fs_closing(fs_info)) { + ret = -EINVAL; + goto out; + } + + set_balance_control(bctl); + + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, bargs); + } + + __cancel_balance(fs_info); + + return ret; +out: + kfree(bctl); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..88258238528 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,17 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +struct btrfs_balance_args; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; +}; + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -228,7 +239,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit v1.2.3 From f43ffb60fd94e98be02780944e182ade6653b916 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: add basic infrastructure for selective balancing This allows to have a separate set of filters for each chunk type (data,meta,sys). The code however is generic and switch on chunk type is only done once. This commit also adds a type filter: it allows to balance for example meta and system chunks w/o touching data ones. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 3 +++ fs/btrfs/volumes.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 11 ++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d838d2cfb94..29b3a94933f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3116,6 +3116,9 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } ret = btrfs_balance(bctl, bargs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9fc06e6bc51..91bbf6e774c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2102,6 +2102,30 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) kfree(bctl); } +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + return 1; +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2119,10 +2143,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; int ret; int enospc_errors = 0; @@ -2179,8 +2206,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) break; } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != key.objectid) break; @@ -2188,7 +2217,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) if (found_key.offset == 0) break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); btrfs_release_path(path); + if (!ret) + goto loop; + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, @@ -2197,6 +2233,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) goto error; if (ret == -ENOSPC) enospc_errors++; +loop: key.offset = found_key.offset - 1; } @@ -2227,6 +2264,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; int ret; if (btrfs_fs_closing(fs_info)) { @@ -2234,6 +2272,23 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + printk(KERN_ERR "btrfs: with mixed groups data and " + "metadata balance options must be the same\n"); + ret = -EINVAL; + goto out; + } + } + set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 88258238528..003e5421606 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,17 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + struct btrfs_balance_args; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; -- cgit v1.2.3 From ed25e9b26f898d8d63ae4a836489f1923534143b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: profiles filter Select chunks based on a given profile mask. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 24 ++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91bbf6e774c..447bd422d86 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2102,6 +2102,24 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) kfree(bctl); } +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->profiles & chunk_profile) + return 0; + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2123,6 +2141,12 @@ static int should_balance_chunk(struct btrfs_root *root, else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) bargs = &bctl->meta; + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 003e5421606..fb20d774044 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -196,6 +196,10 @@ struct map_lookup { #define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ BTRFS_BALANCE_SYSTEM | \ BTRFS_BALANCE_METADATA) +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From 5ce5b3c0916ba3a2e34cf648b94044adc5ef9e76 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: usage filter Select chunks that are less than X percent full. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 37 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 447bd422d86..b858242374d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2120,6 +2120,36 @@ static int chunk_profiles_filter(u64 chunk_profile, return 1; } +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor <= 0) + return 0; + if (factor >= 100) + return num; + + num *= factor; + do_div(num, 100); + return num; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2147,6 +2177,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fb20d774044..eee77fcf812 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -200,6 +200,7 @@ struct map_lookup { * Balance filters */ #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From 409d404b461afa9738619f249fd7f62a366b68c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: devid filter Relocate chunks which have at least one stripe located on a device with devid X. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 23 +++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 24 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b858242374d..9ff5cd09a25 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2150,6 +2150,23 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, return ret; } +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2183,6 +2200,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index eee77fcf812..7cfec03c29b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -201,6 +201,7 @@ struct map_lookup { */ #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From 94e60d5a5c4b98a32b1077dec88df09ada712376 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: devid subset filter Select chunks which have at least one byte of at least one stripe located on a device with devid X in a given [pstart,pend) physical address range. This filter only works when devid filter is turned on. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 47 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ff5cd09a25..c60071a448a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2167,6 +2167,46 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + factor = num_stripes / factor; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2206,6 +2246,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7cfec03c29b..844b08e388f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -202,6 +202,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From ea67176ae8c024f64d85ec33873e5eadf1af7247 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: virtual address space subset filter Select chunks which have at least one byte located inside a given [vstart, vend) virtual address space range. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 20 ++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 21 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c60071a448a..e86c9e4fe51 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2207,6 +2207,20 @@ static int chunk_drange_filter(struct extent_buffer *leaf, return 1; } +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2252,6 +2266,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 844b08e388f..eac26c35931 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -203,6 +203,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From 70922617b0099f420deceb53d5dc7f4fb30d08d0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: do not reduce profile in do_chunk_alloc() Every caller of do_chunk_alloc() feeds it the reduced allocation profile, so stop trying to reduce it one more time. Instead check the validity of the passed profile. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 18 ++++++++++++++++++ fs/btrfs/extent-tree.c | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4d98c8df5c..1e7aea60da2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2536,6 +2536,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } +/** + * profile_is_valid - tests whether a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static inline int profile_is_valid(u64 flags, int extended) +{ + u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + if (extended) + mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & mask) + return 0; + /* true if zero or exactly one bit set */ + return (flags & (~flags + 1)) == flags; +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 946b0671c5d..a1a18ea7b6c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3295,7 +3295,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - flags = btrfs_reduce_alloc_profile(extent_root, flags); + BUG_ON(!profile_is_valid(flags, 0)); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { -- cgit v1.2.3 From e4d8ec0f65b91bfb4984a4927632ded95f9825ad Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: implement online profile changing Profile changing is done by launching a balance with BTRFS_BALANCE_CONVERT bits set and target fields of respective btrfs_balance_args structs initialized. Profile reducing code in this case will pick restriper's target profile if it's available instead of doing a blind reduce. If target profile is not yet available it goes back to a plain reduce. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 56 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 5 ++++ 3 files changed, 129 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1a18ea7b6c..e6a832e3e64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3030,7 +3030,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) /* * @flags: available profiles in extended format (see ctree.h) * - * Returns reduced profile in chunk format. + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { @@ -3042,6 +3044,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + /* pick restriper's target profile if it's available */ + spin_lock(&root->fs_info->balance_lock); + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && + (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->data.target)) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && + (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->sys.target)) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && + (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->meta.target)) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + spin_unlock(&root->fs_info->balance_lock); + flags = tgt; + goto out; + } + } + spin_unlock(&root->fs_info->balance_lock); + if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3065,6 +3095,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) flags &= ~BTRFS_BLOCK_GROUP_RAID0; } +out: /* extended -> chunk profile */ flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; @@ -6795,6 +6826,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + /* pick restriper's target profile and return */ + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + /* extended -> chunk profile */ + tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + return tgt; + } + } + /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e86c9e4fe51..f08210e8333 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2438,6 +2438,75 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + /* + * Profile changing sanity checks. Skip them if a simple + * balance is requested. + */ + if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & + BTRFS_BALANCE_ARGS_CONVERT)) + goto do_balance; + + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (fs_info->fs_devices->num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (fs_info->fs_devices->num_devices < 4) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + else + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10); + + if (!profile_is_valid(bctl->data.target, 1) || + bctl->data.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "data profile %llu\n", + (unsigned long long)bctl->data.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->meta.target, 1) || + bctl->meta.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "metadata profile %llu\n", + (unsigned long long)bctl->meta.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->sys.target, 1) || + bctl->sys.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "system profile %llu\n", + (unsigned long long)bctl->sys.target); + ret = -EINVAL; + goto out; + } + + if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } + } + +do_balance: set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index eac26c35931..79ee9c32456 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -196,6 +196,9 @@ struct map_lookup { #define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ BTRFS_BALANCE_SYSTEM | \ BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) + /* * Balance filters */ @@ -205,6 +208,8 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) + struct btrfs_balance_args; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; -- cgit v1.2.3 From cfa4c961cc69ffb7bda450972320a25cbd413e19 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: soft profile changing mode (aka soft convert) When doing convert from one profile to another if soft mode is on restriper won't touch chunks that already have the profile we are converting to. This is useful if e.g. half of the FS was converted earlier. The soft mode switch is (like every other filter) per-type. This means that we can convert for example meta chunks the "hard" way while converting data chunks selectively with soft switch. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 23 +++++++++++++++++++++++ fs/btrfs/volumes.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f08210e8333..98b4067017f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2221,6 +2221,23 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_soft_convert_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->target & chunk_profile) + return 1; + + return 0; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2272,6 +2289,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 79ee9c32456..6c143c98017 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -208,7 +208,13 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ #define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit v1.2.3 From 0940ebf6b92ea10a6f30ae5ac3993a3b75745da6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: save balance parameters to disk Introduce a new btree objectid for storing balance item. The reason is to be able to resume restriper after a crash with the same parameters. Balance item has a very high objectid and goes into tree of tree roots. The key for the new item is as follows: [ BTRFS_BALANCE_OBJECTID ; BTRFS_BALANCE_ITEM_KEY ; 0 ] Older kernels simply ignore it so it's safe to mount with an older kernel and then go back to the newer one. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e7aea60da2..9997a59e4f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,9 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -692,6 +695,54 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + __le64 unused[8]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -1409,6 +1460,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_BALANCE_ITEM_KEY 248 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -2103,8 +2156,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_super_block */ +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); +} + +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 98b4067017f..4c60ca0ae90 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2073,6 +2073,97 @@ error: return ret; } +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + /* * Should be called with both balance and volume mutexes held to * serialize other volume operations (add_dev/rm_dev/resize) with @@ -2423,7 +2514,11 @@ error: static void __cancel_balance(struct btrfs_fs_info *fs_info) { + int ret; + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + BUG_ON(ret); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, @@ -2530,6 +2625,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } do_balance: + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret) + goto out; + set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); -- cgit v1.2.3 From 596410151ed71819b9e8a8018c6c9992796b256d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: recover balance on mount On mount, if balance item is found, resume balance in a separate kernel thread. Try to be smart to continue roughly where previous balance (or convert) was interrupted. For chunk types that were being converted to some profile we turn on soft convert, in case of a simple balance we turn on usage filter and relocate only less-than-90%-full chunks of that type. These are just heuristics but they help quite a bit, and can be improved in future. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 4 ++ fs/btrfs/volumes.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 2 + 3 files changed, 139 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 190a1b24c90..eb7a11ac5b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2427,6 +2427,10 @@ retry_root_backup: if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + + if (!err) + err = btrfs_recover_balance(fs_info->tree_root); + if (err) { close_ctree(tree_root); return ERR_PTR(err); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c60ca0ae90..17e565388de 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "compat.h" #include "ctree.h" @@ -2164,6 +2165,46 @@ out: return ret; } +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + /* * Should be called with both balance and volume mutexes held to * serialize other volume operations (add_dev/rm_dev/resize) with @@ -2626,10 +2667,18 @@ int btrfs_balance(struct btrfs_balance_control *bctl, do_balance: ret = insert_balance_item(fs_info->tree_root, bctl); - if (ret) + if (ret && ret != -EEXIST) goto out; - set_balance_control(bctl); + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } mutex_unlock(&fs_info->balance_mutex); @@ -2646,7 +2695,89 @@ do_balance: return ret; out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else + kfree(bctl); + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_balance_control *bctl = + (struct btrfs_balance_control *)data; + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +int btrfs_recover_balance(struct btrfs_root *tree_root) +{ + struct task_struct *tsk; + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + goto out_bctl; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out_bctl; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = tree_root->fs_info; + bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); + if (IS_ERR(tsk)) + ret = PTR_ERR(tsk); + else + goto out; + +out_bctl: kfree(bctl); +out: + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6c143c98017..cd25ea58ec3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -198,6 +198,7 @@ struct map_lookup { BTRFS_BALANCE_METADATA) #define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) /* * Balance filters @@ -271,6 +272,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +int btrfs_recover_balance(struct btrfs_root *tree_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit v1.2.3 From 9555c6c180600b40f6e86bd4dc53bf47e06ed663 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: add skip_balance mount option Since restriper kthread starts involuntarily on mount and can suck cpu and memory bandwidth add a mount option to forcefully skip it. The restriper in that case hangs around in paused state and can be resumed from userspace when it's convenient. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 1 + fs/btrfs/super.c | 11 +++++++++-- fs/btrfs/volumes.c | 10 +++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9997a59e4f5..99eb2bcd9aa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1492,6 +1492,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea..063b521e3de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,8 +164,9 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +201,7 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, {Opt_err, NULL}, }; @@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 17e565388de..e0160607e6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2707,15 +2707,19 @@ static int balance_kthread(void *data) struct btrfs_balance_control *bctl = (struct btrfs_balance_control *)data; struct btrfs_fs_info *fs_info = bctl->fs_info; - int ret; + int ret = 0; mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); set_balance_control(bctl); - printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + } else { + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + } mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); -- cgit v1.2.3 From 837d5b6e46d1a4af5b6cc8f2fe83cb5de79a2961 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: allow for pausing restriper Implement an ioctl for pausing restriper. This pauses the relocation, but balance is still considered to be "in progress": balance item is not deleted, other volume operations cannot be started, etc. If paused in the middle of profile changing operation we will continue making allocations with the target profile. Add a hook to close_ctree() to pause restriper and free its data structures on unmount. (It's safe to unmount when restriper is in "paused" state, we will resume with the same parameters on the next mount) Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/ioctl.c | 28 +++++++++++++++++++++++++++- fs/btrfs/ioctl.h | 7 +++++++ fs/btrfs/volumes.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 1 + 6 files changed, 94 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99eb2bcd9aa..1afda75d541 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1214,7 +1214,10 @@ struct btrfs_fs_info { /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -2658,6 +2661,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb7a11ac5b7..8ce83740780 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2004,7 +2004,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2980,6 +2983,9 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(root->fs_info); + btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 29b3a94933f..f572c53dda4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3072,6 +3072,11 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, bargs->flags = bctl->flags; + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); @@ -3103,6 +3108,11 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) bargs = NULL; } + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { ret = -ENOMEM; @@ -3123,7 +3133,8 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance + * bctl is freed in __cancel_balance or in free_fs_info if + * restriper was paused all the way until unmount */ if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) @@ -3138,6 +3149,19 @@ out: return ret; } +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + } + + return -EINVAL; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3216,6 +3240,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_progress(root, argp); case BTRFS_IOC_BALANCE_V2: return btrfs_ioctl_balance(root, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c8b37d2c0d7..e972e11a8d7 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,9 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 + /* * this is packed, because it should be exactly the same as its disk * byte order counterpart (struct btrfs_disk_balance_args) @@ -137,6 +140,9 @@ struct btrfs_balance_progress { __u64 completed; /* # of chunks relocated so far */ }; +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) + struct btrfs_ioctl_balance_args { __u64 flags; /* in/out */ __u64 state; /* out */ @@ -315,6 +321,7 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_fs_info_args) #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0160607e6e..d32660ce753 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2492,6 +2492,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if (atomic_read(&fs_info->balance_pause_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2553,6 +2558,11 @@ error: return ret; } +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + return atomic_read(&fs_info->balance_pause_req) == 0; +} + static void __cancel_balance(struct btrfs_fs_info *fs_info) { int ret; @@ -2575,7 +2585,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int ret; - if (btrfs_fs_closing(fs_info)) { + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req)) { ret = -EINVAL; goto out; } @@ -2680,18 +2691,25 @@ do_balance: spin_unlock(&fs_info->balance_lock); } + atomic_inc(&fs_info->balance_running); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); if (bargs) { memset(bargs, 0, sizeof(*bargs)); update_ioctl_balance_args(fs_info, bargs); } - __cancel_balance(fs_info); + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); return ret; out: @@ -2785,6 +2803,35 @@ out: return ret; } +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cd25ea58ec3..80953afb12b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -273,6 +273,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit v1.2.3 From a7e99c691af553fc15ac46a51f130b7c59a65f76 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: allow for canceling restriper Implement an ioctl for canceling restriper. Currently we wait until relocation of the current block group is finished, in future this can be done by triggering a commit. Balance item is deleted and no memory about the interrupted balance is kept. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 4 ++++ fs/btrfs/ioctl.h | 2 ++ fs/btrfs/volumes.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.h | 1 + 6 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1afda75d541..dfc136cc07d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1216,6 +1216,7 @@ struct btrfs_fs_info { struct mutex balance_mutex; atomic_t balance_running; atomic_t balance_pause_req; + atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ce83740780..c23b82d8ec0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2006,6 +2006,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->balance_mutex); atomic_set(&fs_info->balance_running, 0); atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f572c53dda4..60852217ce9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3076,6 +3076,8 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); @@ -3157,6 +3159,8 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) switch (cmd) { case BTRFS_BALANCE_CTL_PAUSE: return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); } return -EINVAL; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e972e11a8d7..cd19d10794b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -111,6 +111,7 @@ struct btrfs_ioctl_fs_info_args { /* balance control ioctl modes */ #define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 /* * this is packed, because it should be exactly the same as its disk @@ -142,6 +143,7 @@ struct btrfs_balance_progress { #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) struct btrfs_ioctl_balance_args { __u64 flags; /* in/out */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d32660ce753..c32667318ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2492,7 +2492,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if (atomic_read(&fs_info->balance_pause_req)) { + if (atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; } @@ -2560,7 +2561,10 @@ error: static inline int balance_need_close(struct btrfs_fs_info *fs_info) { - return atomic_read(&fs_info->balance_pause_req) == 0; + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); } static void __cancel_balance(struct btrfs_fs_info *fs_info) @@ -2586,7 +2590,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int ret; if (btrfs_fs_closing(fs_info) || - atomic_read(&fs_info->balance_pause_req)) { + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { ret = -EINVAL; goto out; } @@ -2832,6 +2837,42 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return ret; } +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80953afb12b..caa9abd218e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -274,6 +274,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_recover_balance(struct btrfs_root *tree_root); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit v1.2.3 From de322263d3a6d4ffd4ed7c4d0c6536e9497aec9b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: allow for resuming restriper after it was paused Recognize BTRFS_BALANCE_RESUME flag passed from userspace. We use the same heuristics used when recovering balance after a crash to try to start where we left off last time. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 60852217ce9..85e546ffe3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3106,6 +3106,20 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) ret = PTR_ERR(bargs); goto out; } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } } else { bargs = NULL; } @@ -3133,6 +3147,7 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } +do_balance: ret = btrfs_balance(bctl, bargs); /* * bctl is freed in __cancel_balance or in free_fs_info if -- cgit v1.2.3 From 19a39dce3b9bf0244d19a446718ad6f7605ff099 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: add balance progress reporting Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/ioctl.h | 2 ++ fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++++++++++++---- fs/btrfs/volumes.h | 3 +++ 4 files changed, 84 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 85e546ffe3c..1e7a9bac31a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,7 +3065,7 @@ out: return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -3082,6 +3082,14 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } } static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) @@ -3181,6 +3189,39 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) return -EINVAL; } +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3261,6 +3302,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance(root, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index cd19d10794b..4f69028a68c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -324,6 +324,8 @@ struct btrfs_ioctl_logical_ino_args { #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_args) #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32667318ae..d73439b4d7d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2441,6 +2441,7 @@ static u64 div_factor(u64 num, int factor) static int __btrfs_balance(struct btrfs_fs_info *fs_info) { + struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_root *dev_root = fs_info->dev_root; struct list_head *devices; @@ -2456,6 +2457,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int slot; int ret; int enospc_errors = 0; + bool counting = true; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2487,12 +2489,18 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = -ENOMEM; goto error; } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if (atomic_read(&fs_info->balance_pause_req) || + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; @@ -2529,24 +2537,47 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) goto loop; + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } loop: key.offset = found_key.offset - 1; } + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); if (enospc_errors) { @@ -2576,7 +2607,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) BUG_ON(ret); } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); /* @@ -2706,7 +2737,7 @@ do_balance: if (bargs) { memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, bargs); + update_ioctl_balance_args(fs_info, 0, bargs); } if ((ret && ret != -ECANCELED && ret != -ENOSPC) || diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index caa9abd218e..6faec9dd1f9 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -218,6 +218,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) struct btrfs_balance_args; +struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; @@ -226,6 +227,8 @@ struct btrfs_balance_control { struct btrfs_balance_args sys; u64 flags; + + struct btrfs_balance_progress stat; }; int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, -- cgit v1.2.3 From 7ad85bb76a61801362701b77c5cee5aa09f35369 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Jan 2012 19:10:12 -0500 Subject: Btrfs: do not use btrfs_end_transaction_throttle everywhere A user reported a problem where things like open with O_CREAT would take up to 30 seconds when he had nfs activity on the same mount. This is because all of our quick metadata operations, like create, symlink etc all do btrfs_end_transaction_throttle, which if the transaction is blocked will wait for the commit to complete before it returns. This adds a ridiculous amount of latency and isn't really needed. The normal btrfs_end_transaction will mark the transaction as blocked and wake the transaction kthread up if it thinks the transaction needs to end (this being in the running out of global reserve space scenario), and this is all that is really needed since we've already done everything we're going to do, we just need to return. This should help people with the latency they were seeing when using synchronous heavy workloads. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 18 +++++++++--------- fs/btrfs/xattr.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index acc4ff39ca4..5f8ba210c0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2845,7 +2845,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3434,7 +3434,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } else { /* @@ -4655,7 +4655,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); @@ -4723,7 +4723,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4782,7 +4782,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -4848,7 +4848,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -6668,7 +6668,7 @@ end_trans: err = ret; nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); + ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); } @@ -7075,7 +7075,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -7247,7 +7247,7 @@ out_unlock: if (!err) d_instantiate(dentry, inode); nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3848b04e310..e7a5659087e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); return ret; } -- cgit v1.2.3 From f70a9a6b94af86fca069a7552ab672c31b457786 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 12 Jan 2012 19:10:12 -0500 Subject: Btrfs: fix btrfsck error 400 when truncating a compressed Reproduce steps: # mkfs.btrfs /dev/sdb5 # mount /dev/sdb5 -o compress=lzo /mnt # dd if=/dev/zero of=/mnt/tmpfile bs=128K count=1 # sync # truncate -s 64K /mnt/tmpfile root 5 inode 257 errors 400 This is because of the wrong if condition, which is used to check if we should subtract the bytes of the dropped range from i_blocks/i_bytes of i-node or not. When we truncate a compressed extent, btrfs substracts the bytes of the whole extent, it's wrong. We should substract the real size that we truncate, no matter it is a compressed extent or not. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f8ba210c0a..946a7f1b329 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3009,7 +3009,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; - int encoding; int ret; int err = 0; u64 ino = btrfs_ino(inode); @@ -3059,7 +3058,6 @@ search_again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - encoding = 0; if (found_key.objectid != ino) break; @@ -3072,10 +3070,6 @@ search_again: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - encoding = btrfs_file_extent_compression(leaf, fi); - encoding |= btrfs_file_extent_encryption(leaf, fi); - encoding |= btrfs_file_extent_other_encoding(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); @@ -3103,7 +3097,7 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item && !encoding) { + if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = new_size - -- cgit v1.2.3 From ec39e180fd3188c983c94603634bfcd019f42ae7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Jan 2012 19:10:12 -0500 Subject: Btrfs: release space on error in page_mkwrite If updating the inode gave us an ENOSPC we were just returning in page_mkwrite, which is a problem since we make our reservation right before trying to update the inode, so fix the out label so that we actually free our reservation. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 946a7f1b329..85fd86ea983 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6488,8 +6488,8 @@ out_unlock: if (!ret) return VM_FAULT_LOCKED; unlock_page(page); - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); return ret; } -- cgit v1.2.3 From 45a8090e626ab470c91142954431a93846030b0d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Jan 2012 19:10:12 -0500 Subject: Btrfs: don't call btrfs_throttle in file write Btrfs_throttle will make us wait if there is a currently committing transaction until we can open new transactions, which is ridiculous since we don't actually start any transactions within the file write path anyway, so all this does is introduce big latencies if we have a sync/fsync heavy workload going on while somebody else is trying to do work. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fc97b00bd87..0f61e11a299 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1273,7 +1273,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); pos += copied; num_written += copied; -- cgit v1.2.3 From 3f7de037fb3727b20bc27332cdcf2488b702394c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 10 Nov 2011 08:29:20 -0500 Subject: Btrfs: add allocator tracepoints I used these tracepoints when figuring out what the cluster stuff was doing, so add them to mainline in case we need to profile this stuff again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 +++++++++ fs/btrfs/free-space-cache.c | 12 +++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a44072a692a..ad1a20bc834 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5256,6 +5256,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; + trace_find_free_extent(orig_root, num_bytes, empty_size, data); + space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5432,6 +5434,8 @@ alloc: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, num_bytes); goto checks; } @@ -5490,6 +5494,9 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5576,6 +5583,8 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6c7887a7770..efe20032e4a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2346,6 +2346,8 @@ again: &entry->offset_index, 1); BUG_ON(ret); + trace_btrfs_setup_cluster(block_group, cluster, + total_found * block_group->sectorsize, 1); return 0; } @@ -2368,6 +2370,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, u64 window_start; u64 window_free; u64 max_extent; + u64 total_size = 0; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2433,11 +2436,12 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); + total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } @@ -2542,6 +2546,10 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); @@ -2559,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); -- cgit v1.2.3 From 90290e19820e3323ce6b9c2888eeb68bf29c278b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 2 Dec 2011 15:44:12 -0500 Subject: Btrfs: protect orphan block rsv with spin_lock We've been seeing warnings coming out of the orphan commit stuff forever from ceph. Turns out it's because we're racing with checking if the orphan block reserve is set, because we clear it outside of the spin_lock. So leave the normal fastpath checks where they are, but take the spin_lock and _recheck_ to make sure we haven't had an orphan block rsv added in the meantime. Then clear the root's orphan block rsv and release the lock. With this patch a user said the warnings went away and they usually showed up pretty soon after he started ceph. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 85fd86ea983..619742d3716 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state { void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; int ret; if (!list_empty(&root->orphan_list) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; + spin_lock(&root->orphan_lock); + if (!list_empty(&root->orphan_list)) { + spin_unlock(&root->orphan_lock); + return; + } + + if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { + spin_unlock(&root->orphan_lock); + return; + } + + block_rsv = root->orphan_block_rsv; + root->orphan_block_rsv = NULL; + spin_unlock(&root->orphan_lock); + if (root->orphan_item_inserted && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, @@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_item_inserted = 0; } - if (root->orphan_block_rsv) { - WARN_ON(root->orphan_block_rsv->size > 0); - btrfs_free_block_rsv(root, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; + if (block_rsv) { + WARN_ON(block_rsv->size > 0); + btrfs_free_block_rsv(root, block_rsv); } } -- cgit v1.2.3 From 8c2a3ca20f6233677ac3222c6506174010eb414f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 10 Jan 2012 10:31:31 -0500 Subject: Btrfs: space leak tracepoints This in addition to a script in my btrfs-tracing tree will help track down space leaks when we're getting space left over in block groups on umount. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/delayed-inode.c | 45 +++++++++++++++++++++++++++---------- fs/btrfs/extent-tree.c | 58 +++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/inode-map.c | 4 ++++ fs/btrfs/transaction.c | 2 ++ 4 files changed, 89 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9c1eccc2c50..fe4cd0f1cef 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); item->bytes_reserved = num_bytes; + } return ret; } @@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - int release = false; + bool release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) + if (!ret) { node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -707,11 +719,17 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; + } - if (release) + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, src_rsv, num_bytes); + } return ret; } @@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ad1a20bc834..556f9aa25bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3310,6 +3310,8 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)data_sinfo, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3329,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)data_sinfo, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3686,6 +3690,10 @@ again: if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3753,6 +3761,10 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3859,7 +3871,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3895,6 +3908,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4051,7 +4067,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, + num_bytes); } /* @@ -4110,11 +4127,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4149,7 +4170,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4166,6 +4188,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; + trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4183,6 +4207,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. */ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4190,6 +4216,8 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4370,8 +4398,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) + if (to_free) { btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, + "delalloc", + btrfs_ino(inode), + to_free, 0); + } return ret; } @@ -4383,6 +4416,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info,"delalloc", + btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4412,6 +4448,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4666,7 +4704,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - BUG_ON(space_info->bytes_may_use < num_bytes); + trace_btrfs_space_reservation(cache->fs_info, + "space_info", + (u64)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } @@ -6126,10 +6167,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); } /* @@ -6159,7 +6201,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(block_rsv, blocksize); + unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f8962a957d6..213ffa86ce1 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -498,6 +500,8 @@ again: out_put: iput(inode); out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d5f987b49d7..287a6728b1a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -326,6 +326,8 @@ again: } if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)h, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } -- cgit v1.2.3 From f248679e86fead40cc78e724c7181d6bec1a2046 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Jan 2012 12:09:22 -0500 Subject: Btrfs: add a delalloc mutex to inodes for delalloc reservations I was using i_mutex for this, but we're getting bogus lockdep warnings by doing that and theres no real way to get rid of those, so just stop using i_mutex to protect delalloc metadata reservations and use a delalloc mutex instead. This shouldn't be contended often at all, only if you are writing and mmap writing to the file at the same time. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/inode.c | 11 +---------- fs/btrfs/ioctl.c | 2 -- fs/btrfs/relocation.c | 2 -- 5 files changed, 7 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 634608d2a6d..9b9b15fd520 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,9 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 556f9aa25bb..e0ad5f0f895 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4345,12 +4345,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(root, inode)) flush = 0; - else - WARN_ON(!mutex_is_locked(&inode->i_mutex)); if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4405,6 +4404,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4415,6 +4415,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(root->fs_info,"delalloc", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 619742d3716..5977987abdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2239,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; - /* - * Need to hold the imutex for reservation purposes, not - * a huge deal here but I have a WARN_ON in - * btrfs_delalloc_reserve_space to catch offenders. - */ - mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); - mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -6411,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - /* Need this to keep space reservations serialized */ - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); if (!ret) ret = btrfs_update_time(vma->vm_file); if (ret) { @@ -6758,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); mutex_init(&ei->log_mutex); + mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7fdf22c2dc0..6834be4c870 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -868,10 +868,8 @@ static int cluster_pages_for_defrag(struct inode *inode, return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); - mutex_unlock(&inode->i_mutex); if (ret) return ret; again: diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index efe9f792544..8c1aae2c845 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2949,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); if (ret) goto out; -- cgit v1.2.3 From 96bdc7dc61fb1b1e8e858dafb13abee8482ba064 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 16 Jan 2012 08:13:11 -0500 Subject: Btrfs: use larger system chunks system chunks by default are very small. This makes them slightly larger and also fixes the conditional checks to make sure we don't allocate a billion of them at once. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/volumes.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0ad5f0f895..700879ed64c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3384,6 +3384,9 @@ static int should_alloc_chunk(struct btrfs_root *root, /* 256MB or 2% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); + /* system chunks need a much small threshold */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) + thresh = 32 * 1024 * 1024; if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 59e878f9fdc..7ffdb154dae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3166,7 +3166,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 8 * 1024 * 1024; + max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", -- cgit v1.2.3 From b1375d64c539c5b76794be759b62d3f178e67c32 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: fix uninit warning in backref.c Added initialization with the declaration of ret. It isn't set later on the switch-default branch (which should never be taken). Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b9a843226de..633c701a287 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -297,7 +297,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; int sgn; - int ret; + int ret = 0; if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(info_key, &extent_op->key); @@ -392,7 +392,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_key *info_key, int *info_level, struct list_head *prefs) { - int ret; + int ret = 0; int slot; struct extent_buffer *leaf; struct btrfs_key key; -- cgit v1.2.3 From 357b9784b79924a31ccded5d9a0c688f48cc28f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: make sure a bitmap has enough bytes We have only been checking for min_bytes available in bitmap entries, but we won't successfully setup a bitmap cluster unless it has at least bytes in the bitmap, so in the common case min_bytes is 4k and we want something like 2MB, so if there are a bunch of bitmap entries with less than 2mb's in them, we'll search all them anyway, which is suboptimal. Fix this check. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index efe20032e4a..6e740693234 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2475,7 +2475,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } list_for_each_entry(entry, bitmaps, list) { - if (entry->bytes < min_bytes) + if (entry->bytes < bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, bytes, cont1_bytes, min_bytes); -- cgit v1.2.3 From 6dd70ce4eb7429c2ba6dd9fa46f78a0a2a254038 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: btrfs: Fix busyloops in transaction waiting code wait_log_commit() and wait_for_writer() were using slightly different conditions for deciding whether they should call schedule() and whether they should continue in the wait loop. Thus it could happen that we busylooped when the first condition was not true while the second one was. That is burning CPU cycles needlessly and is deadly on UP machines... Signed-off-by: Jan Kara Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cb877e0886a..966cc74f5d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); return 0; } @@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { + while (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); -- cgit v1.2.3 From 8bedd51b6121c4607784d75f852828d25d119c52 Mon Sep 17 00:00:00 2001 From: Mitch Harder Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: Check for NULL page in extent_range_uptodate A user has encountered a NULL pointer kernel oops in btrfs when encountering media errors. The problem has been identified as an unhandled NULL pointer returned from find_get_page(). This modification simply checks for a NULL page, and returns with an error if found (the extent_range_uptodate() function returns 1 on errors). After testing this patch, the user reported that the error with the NULL pointer oops was solved. However, there is still a remaining problem with a thread becoming stuck in wait_on_page_locked(page) in the read_extent_buffer_pages(...) function in extent_io.c for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); wait_on_page_locked(page); if (!PageUptodate(page)) ret = -EIO; } This patch leaves the issue with the locked page yet to be resolved. Signed-off-by: Mitch Harder Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9d09a4f8187..fcf77e1ded4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3909,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree, while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); + if (!page) + return 1; uptodate = PageUptodate(page); page_cache_release(page); if (!uptodate) { -- cgit v1.2.3 From 0b4a9d248f88e6773312f262e8185f23863d984a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: use cluster->window_start when allocating from a cluster bitmap We specifically set window_start in the cluster struct to indicate where the cluster starts in a bitmap, but we've been using min_start to indicate where we're searching from. This is usually the start of the blockgroup, so essentially means we're constantly searching from the start of any bitmap we find, which completely negates all the trouble we go to in order to setup a cluster. So start using window_start to make sure we actually use the area we found. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6e740693234..61447a51f64 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2242,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, - min_start); + cluster->window_start); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) -- cgit v1.2.3 From 0b485143d835c019cddc45f46e4b3873dcc9aa4e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: fix warning for 32-bit build of fs/btrfs/check-integrity.c There have been 4 warnings on 32-bit build, they are herewith fixed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ad0b3ba735b..b669a7d8e49 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1662,7 +1662,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, &state->block_hashtable); if (NULL != block) { - u64 bytenr; + u64 bytenr = 0; struct list_head *elem_ref_to; struct list_head *tmp_ref_to; @@ -2777,9 +2777,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) printk(KERN_INFO "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," " size=%lu, data=%p, bdev=%p)\n", - rw, bh->b_blocknr, - (unsigned long long)dev_bytenr, bh->b_size, - bh->b_data, bh->b_bdev); + rw, (unsigned long)bh->b_blocknr, + (unsigned long long)dev_bytenr, + (unsigned long)bh->b_size, bh->b_data, + bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, bh->b_data, bh->b_size, NULL, NULL, bh, rw); @@ -2844,7 +2845,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) printk(KERN_INFO "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, bio->bi_sector, + rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, (unsigned long long)dev_bytenr, bio->bi_bdev); -- cgit v1.2.3 From 7ec31b548a17f773ab6289e795ed3a6820e8b56e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: do not defrag a file partially xfstests 218 complains that btrfs defrags a file partially: After: 1 Write backwards sync, but contiguous - should defrag to 1 extent Before: 10 -After: 1 +After: 2 To fix this, we need to set max_to_defrag count properly. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6834be4c870..0b06a5ca8af 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1066,7 +1066,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index; + max_to_defrag = last_index + 1; /* * make writeback starts from i, so the defrag range can be -- cgit v1.2.3 From 9e622d6bea0202e9fe267955362c01918562c09b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: fix enospc error caused by wrong checks of the chunk When we did sysbench test for inline files, enospc error happened easily though there was lots of free disk space which could be allocated for new chunks. Reproduce steps: # mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) # mount /mnt # ulimit -n 102400 # cd /mnt # sysbench --num-threads=1 --test=fileio --file-num=81920 \ > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \ > --file-test-mode=seqwr prepare # sysbench --num-threads=1 --test=fileio --file-num=81920 \ > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \ > --file-test-mode=seqwr run The reason of this bug is: Now, we can reserve space which is larger than the free space in the chunks if we have enough free disk space which can be used for new chunks. By this way, the space allocator should allocate a new chunk by force if there is no free space in the free space cache. But there are two wrong checks which break this operation. One is if (ret == -ENOSPC && num_bytes > min_alloc_size) in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk even we fail to allocate free space by minimum allocable size. The other is if (space_info->force_alloc) force = space_info->force_alloc; in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen. Fix these two wrong checks. Especially the second one, we fix it by changing the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has higher priority. And if the value which is passed in by the caller is greater than ->force_alloc, use the passed value. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 700879ed64c..283af7a676a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,23 +34,24 @@ #include "locking.h" #include "free-space-cache.h" -/* control flags for do_chunk_alloc's force field +/* + * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk * if we really need one. * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * * CHUNK_ALLOC_LIMITED means to only try and allocate one * if we have very few chunks already allocated. This is * used as part of the clustering code to help make sure * we have a good pool of storage to cluster in, without * filling the FS with empty chunks * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * */ enum { CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_FORCE = 1, - CHUNK_ALLOC_LIMITED = 2, + CHUNK_ALLOC_LIMITED = 1, + CHUNK_ALLOC_FORCE = 2, }; /* @@ -3414,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, again: spin_lock(&space_info->lock); - if (space_info->force_alloc) + if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { spin_unlock(&space_info->lock); @@ -5794,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { + bool final_tried = false; int ret; u64 search_start = 0; @@ -5813,22 +5815,25 @@ again: search_start, search_end, hint_byte, ins, data); - if (ret == -ENOSPC && num_bytes > min_alloc_size) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); - num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - goto again; - } - if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { - struct btrfs_space_info *sinfo; - - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (ret == -ENOSPC) { + if (!final_tried) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, CHUNK_ALLOC_FORCE); + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } } trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); -- cgit v1.2.3 From 0c4e538bccc106872d31b1514570b4dac95fb7f2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: btrfs: mask out gfp flags in releasepage btree_releasepage is a callback and can be passed unknown gfp flags and then they may end up in kmem_cache_alloc called from alloc_extent_state, slab allocator will BUG_ON when there is HIGHMEM or DMA32 flag set. This may happen when btrfs is mounted from a loop device, which masks out __GFP_IO flag. The check in try_release_extent_state 3399 if ((mask & GFP_NOFS) == GFP_NOFS) 3400 mask = GFP_NOFS; will not work and passes unfiltered flags further resulting in crash at mm/slab.c:2963 [<000000000024ae4c>] cache_alloc_refill+0x3b4/0x5c8 [<000000000024c810>] kmem_cache_alloc+0x204/0x294 [<00000000001fd3c2>] mempool_alloc+0x52/0x170 [<000003c000ced0b0>] alloc_extent_state+0x40/0xd4 [btrfs] [<000003c000cee5ae>] __clear_extent_bit+0x38a/0x4cc [btrfs] [<000003c000cee78c>] try_release_extent_state+0x9c/0xd4 [btrfs] [<000003c000cc4c66>] btree_releasepage+0x7e/0xd0 [btrfs] [<0000000000210d84>] shrink_page_list+0x6a0/0x724 [<0000000000211394>] shrink_inactive_list+0x230/0x578 [<0000000000211bb8>] shrink_list+0x6c/0x120 [<0000000000211e4e>] shrink_zone+0x1e2/0x228 [<0000000000211f24>] shrink_zones+0x90/0x254 [<0000000000213410>] do_try_to_free_pages+0xac/0x420 [<0000000000213ae0>] try_to_free_pages+0x13c/0x1b0 [<0000000000204e6c>] __alloc_pages_nodemask+0x5b4/0x9a8 [<00000000001fb04a>] grab_cache_page_write_begin+0x7e/0xe8 Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index da4457f84d7..4c867112b4c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -961,6 +961,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; + /* + * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing + * slab allocation from alloc_extent_state down the callchain where + * it'd hit a BUG_ON as those flags are not allowed. + */ + gfp_flags &= ~GFP_SLAB_BUG_MASK; + ret = try_release_extent_state(map, tree, page, gfp_flags); if (!ret) return 0; -- cgit v1.2.3 From 9b23062840e7c685ef0a0b561285d6e3a3b6811b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: advance window_start if we're using a bitmap If we span a long area in a bitmap we could end up taking a lot of time searching to the next free area if we're searching from the original window_start, so advance window_start in order to make sure we don't do any superficial searching. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 61447a51f64..5802b1473c3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2251,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } + cluster->window_start += bytes; } else { ret = entry->offset; -- cgit v1.2.3 From 9998eb703490589c3e8f1bf09b15203156776edb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jan 2012 13:47:40 -0500 Subject: Btrfs: fix reservations in btrfs_page_mkwrite Josef fixed btrfs_page_mkwrite to properly release reserved extents if there was an error. But if we fail to get a reservation and we fail to dirty the inode (for ENOSPC reasons), we'll end up trying to release a reservation we never had. This makes sure we only release if we were able to reserve. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5977987abdb..7405753ec5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6401,18 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long zero_start; loff_t size; int ret; + int reserved = 0; u64 page_start; u64 page_end; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (!ret) + if (!ret) { ret = btrfs_update_time(vma->vm_file); + reserved = 1; + } if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - goto out; + if (reserved) + goto out; + goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -6495,6 +6500,7 @@ out_unlock: unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve: return ret; } -- cgit v1.2.3 From d98456fcafa6f3fd1985f9b7429aaa3531c6bfa0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Jan 2012 20:27:41 -0500 Subject: Btrfs: don't reserve data with extents locked in btrfs_fallocate btrfs_fallocate tries to allocate space only if ranges in the file don't already exist. But the enospc checks it does are not allowed with extents locked. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f61e11a299..0621a3a7d5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,6 +1603,14 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, len); + if (ret) + return ret; + /* * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. @@ -1666,27 +1674,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - - /* - * Make sure we have enough space before we do the - * allocation. - */ - ret = btrfs_check_data_free_space(inode, last_byte - - cur_offset); - if (ret) { - free_extent_map(em); - break; - } - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); - /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, last_byte - - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1714,6 +1707,8 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, len); return ret; } -- cgit v1.2.3 From 934e7d44b810691ae5aefa3308b97a402aac1a55 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 7 Feb 2012 22:21:45 +0900 Subject: btrfs: Fix typo in free-space-cache.c Correct spelling "cace" to "cache" in fs/btrfs/free-space-cache.c Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c2f20594c9f..7f4f3025357 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1067,7 +1067,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); ret = 0; #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cace " + printk(KERN_ERR "btrfs: failed to write free space cache " "for block group %llu\n", block_group->key.objectid); #endif } -- cgit v1.2.3 From 3e93b8dfd9dd8735152e59913a2bde226f83d43e Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 5 Feb 2012 01:29:47 +0100 Subject: BTRFS: Don't include disk-io.h twice in check-integrity.c Once should be enough. Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- fs/btrfs/check-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b669a7d8e49..064b29bd160 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -89,7 +89,6 @@ #include "disk-io.h" #include "transaction.h" #include "extent_io.h" -#include "disk-io.h" #include "volumes.h" #include "print-tree.h" #include "locking.h" -- cgit v1.2.3 From 941b2ddf71987ef369389517a7e215dd505fe01e Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 29 Nov 2011 17:44:12 -0800 Subject: btrfs: Sector Size check during Mount Gracefully fail when trying to mount a BTRFS file system that has a sectorsize smaller than PAGE_SIZE. On PPC it is possible to build a FS while using a 4k PAGE_SIZE kernel then boot into a 64K PAGE_SIZE kernel. Presently open_ctree fails in an endless loop and hangs the machine in this situation. My debugging has show this Sector size < Page size to be a non trivial situation and a graceful exit from the situation would be nice for the time being. Signed-off-by: Keith Mannthey --- fs/btrfs/disk-io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c867112b4c..58d0678dfcb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2258,6 +2258,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_sb_buffer; } + if (sectorsize < PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size " + "found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); -- cgit v1.2.3 From 8f24b49688281a77e8331894ed407f0cfe732303 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 8 Feb 2012 16:01:01 +0100 Subject: Btrfs: avoid positive number with ERR_PTR inode_ref_info() returns 1 when the element wasn't found and < 0 on error, just like btrfs_search_slot(). In iref_to_path() it's an error when the inode ref can't be found, thus we return ERR_PTR(ret) in that case. In order to avoid ERR_PTR(1), we now set ret to -ENOENT in that case. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 633c701a287..98f6bf10bbd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -892,6 +892,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, if (eb != eb_in) free_extent_buffer(eb); ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; if (ret) break; next_inum = found_key.offset; -- cgit v1.2.3 From 6af021d8fc3bcce790e7fbb391e39c5920fa3f71 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 9 Feb 2012 14:25:50 +0800 Subject: Btrfs: return the internal error unchanged if btrfs_get_extent_fiemap() call failed for SEEK_DATA/SEEK_HOLE inquiry Given that ENXIO only means "offset beyond EOF" for either SEEK_DATA or SEEK_HOLE inquiry in a desired file range, so we should return the internal error unchanged if btrfs_get_extent_fiemap() call failed, rather than ENXIO. Cc: Dave Chinner Signed-off-by: Jie Liu --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0621a3a7d5d..3a520899b02 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1755,7 +1755,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) start - root->sectorsize, root->sectorsize, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); goto out; } last_end = em->start + em->len; @@ -1767,7 +1767,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) while (1) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); break; } -- cgit v1.2.3 From 2cac13e41bf5b99ffc426bd28dfd2248df1dfa67 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 9 Feb 2012 18:17:41 +0800 Subject: Btrfs: fix trim 0 bytes after a device delete A user reported a bug of btrfs's trim, that is we will trim 0 bytes after a device delete. The reproducer: $ mkfs.btrfs disk1 $ mkfs.btrfs disk2 $ mount disk1 /mnt $ fstrim -v /mnt $ btrfs device add disk2 /mnt $ btrfs device del disk1 /mnt $ fstrim -v /mnt This is because after we delete the device, the block group may start from a non-zero place, which will confuse trim to discard nothing. Reported-by: Lutz Euler Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 283af7a676a..60bfe2d6854 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7886,9 +7886,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - cache = btrfs_lookup_block_group(fs_info, range->start); + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { -- cgit v1.2.3 From 859acaf1a29bbacf6256f1159210c8d6df992b33 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 9 Feb 2012 15:09:02 +0100 Subject: btrfs: don't check DUP chunks twice Because scrub enumerates the dev extent tree to find the chunks to scrub, it currently finds each DUP chunk twice and also scrubs it twice. This patch makes sure that scrub_chunk only checks that part of the chunk the dev extent has been found for. This only changes the behaviour for DUP chunks. Reported-and-tested-by: Stefan Behrens Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9770cc5bfb7..abc0fbffa51 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1367,7 +1367,8 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = &sdev->dev->dev_root->fs_info->mapping_tree; @@ -1391,7 +1392,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev) { + if (map->stripes[i].dev == sdev->dev && + map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sdev, map, i, chunk_offset, length); if (ret) goto out; @@ -1487,7 +1489,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, - chunk_offset, length); + chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) break; -- cgit v1.2.3 From a7e221e9002306a753d6f78b4060edabce402033 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 14 Feb 2012 17:12:23 +0900 Subject: Btrfs: fix memory leak in load_free_space_cache() load_free_space_cache() has forgotten to free path. Signed-off-by: Tsutomu Itoh --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5802b1473c3..b30242f4435 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -777,6 +777,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); + btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); -- cgit v1.2.3 From 87826df0ec36fc28884b4ddbb3f3af41c4c2008f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 15 Feb 2012 16:23:57 +0100 Subject: btrfs: delalloc for page dirtied out-of-band in fixup worker We encountered an issue that was easily observable on s/390 systems but could really happen anywhere. The timing just seemed to hit reliably on s/390 with limited memory. The gist is that when an unexpected set_page_dirty() happened, we'd run into the BUG() in btrfs_writepage_fixup_worker since it wasn't properly set up for delalloc. This patch does the following: - Performs the missing delalloc in the fixup worker - Allow the start hook to return -EBUSY which informs __extent_writepage that it should mark the page skipped and not to redirty it. This is required since the fixup worker can fail with -ENOSPC and the page will have already been redirtied. That causes an Oops in drop_outstanding_extents later. Retrying the fixup worker could lead to an infinite loop. Deferring the page redirty also saves us some cycles since the page would be stuck in a resubmit-redirty loop until the fixup worker completes. It's not harmful, just wasteful. - If the fixup worker fails, we mark the page and mapping as errored, and end the writeback, similar to what we would do had the page actually been submitted to writeback. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 65 ++++++++++++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 14 +++++++++-- 3 files changed, 53 insertions(+), 27 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fcf77e1ded4..89ba79fb945 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2161,6 +2161,38 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(NULL, page, + start, end, NULL); + /* Writeback already completed */ + if (ret == 0) + return 1; + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2172,13 +2204,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; - int ret; do { struct page *page = bvec->bv_page; @@ -2195,28 +2225,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; if (whole_page) end_page_writeback(page); @@ -2818,8 +2829,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc6a042cb6f..cecc3518c12 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -319,4 +319,5 @@ struct btrfs_mapping_tree; int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7405753ec5d..bf392e53261 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1555,6 +1555,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; + int ret; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; @@ -1582,12 +1583,21 @@ again: page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); goto again; } - BUG(); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); + set_page_dirty(page); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -1630,7 +1640,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) fixup->work.func = btrfs_writepage_fixup_worker; fixup->page = page; btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EAGAIN; + return -EBUSY; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From c08782dacd7a098f2b8bca7f4a57a5b402e9e1e5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: btrfs: fix structs where bitfields and spinlock/atomic share 8B word On ia64, powerpc64 and sparc64 the bitfield is modified through a RMW cycle and current gcc rewrites the adjacent 4B word, which in case of a spinlock or atomic has disaterous effect. https://lkml.org/lkml/2012/2/1/220 Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_map.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3c2cbf7b666..8e4457e0478 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -886,7 +886,7 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full:1; + unsigned int full; }; /* diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 33a7890b1f4..1195f09761f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,8 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree:1; - unsigned int compress_type:4; + unsigned int in_tree; + unsigned int compress_type; }; struct extent_map_tree { -- cgit v1.2.3 From 8a3344269465b26761b74493cfbeeaa75d821614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Oct 2011 18:06:13 +0200 Subject: btrfs: silence warning in raid array setup Raid array setup code creates an extent buffer in an usual way. When the PAGE_CACHE_SIZE is > super block size, the extent pages are not marked up-to-date, which triggers a WARN_ON in the following write_extent_buffer call. Add an explicit up-to-date call to silence the warning. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ffdb154dae..d8f282b5baa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4357,6 +4357,20 @@ int btrfs_read_sys_array(struct btrfs_root *root) return -ENOMEM; btrfs_set_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3 From 12fc9d0923ca70ae8960bccebac09d5c12f8c4d4 Mon Sep 17 00:00:00 2001 From: Florian Albrechtskirchinger Date: Fri, 10 Feb 2012 22:15:54 +0100 Subject: btrfs: honor umask when creating subvol root Set the subvol root inode permissions based on the current umask. --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf392e53261..6e0ee9b0d74 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6706,8 +6706,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int err; u64 index = 0; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, S_IFDIR | 0700, &index); + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; -- cgit v1.2.3 From 013bd4c336ad0d30e9e41f9cff0dbc1858934e75 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 10:11:40 +0900 Subject: Btrfs: fix return value check of extent_io_ops This patch adds the check on the return value of extent_io_ops. Signed-off-by: Tsutomu Itoh --- fs/btrfs/extent_io.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89ba79fb945..b05d35a7c0f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2154,9 +2154,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2790,9 +2791,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + &nr_written); + BUG_ON(ret); /* * delalloc_end is already one less than the total * length, so we don't subtract one from -- cgit v1.2.3 From 600a45e1d5e376f679ff9ecc4ce9452710a6d27c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 16 Feb 2012 15:01:24 +0800 Subject: Btrfs: fix deadlock on page lock when doing auto-defragment When I ran xfstests circularly on a auto-defragment btrfs, the deadlock happened. Steps to reproduce: [tty0] # export MOUNT_OPTIONS="-o autodefrag" # export TEST_DEV= # export TEST_DIR= # export SCRATCH_DEV= # export SCRATCH_MNT= # while [ 1 ] > do > ./check 091 127 263 > sleep 1 > done [tty1] # while [ 1 ] > do > echo 3 > /proc/sys/vm/drop_caches > done Several hours later, the test processes will hang on, and the deadlock will happen on page lock. The reason is that: Auto defrag task Flush thread Test task btrfs_writepages() add ordered extent (including page 1, 2) set page 1 writeback set page 2 writeback endio_fn() end page 2 writeback release page 2 lock page 1 alloc and lock page 2 page 2 is not uptodate btrfs_readpage() start ordered extent() btrfs_writepages() try to lock page 1 so deadlock happens. Fix this bug by unlocking the page which is in writeback, and re-locking it after the writeback end. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b06a5ca8af..e9bdb8b783e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -862,6 +862,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) @@ -872,18 +873,34 @@ static int cluster_pages_for_defrag(struct inode *inode, num_pages << PAGE_CACHE_SHIFT); if (ret) return ret; -again: - ret = 0; i_done = 0; + tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; +again: page = find_or_create_page(inode->i_mapping, - start_index + i, mask); + start_index + i, mask); if (!page) break; + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent(tree, page_start, page_end, GFP_NOFS); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -894,15 +911,22 @@ again: break; } } + isize = i_size_read(inode); file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end || - page->mapping != inode->i_mapping) { + if (!isize || page->index > file_end) { /* whoops, we blew past eof, skip this page */ unlock_page(page); page_cache_release(page); break; } + + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + pages[i] = page; i_done++; } @@ -925,25 +949,6 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); - if (ordered && - ordered->file_offset + ordered->len > page_start && - ordered->file_offset < page_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, - &cached_state, GFP_NOFS); - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_wait_ordered_range(inode, page_start, - page_end - page_start); - goto again; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, -- cgit v1.2.3 From 285190d99fef696ec8b0041787387f013cb71d67 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 16:23:58 +0900 Subject: Btrfs: check return value of lookup_extent_mapping() correctly This patch corrects error checking of lookup_extent_mapping(). Signed-off-by: Tsutomu Itoh --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d..d02c27cd14c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b05d35a7c0f..8d6f55fbd28 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3308,7 +3308,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8f282b5baa..cd040bf3fd8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1954,7 +1954,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; -- cgit v1.2.3 From 0449314a9cc5a7fb0bd42e2175a3c46f68f3a2b0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:37 +0800 Subject: Btrfs: skip states when they does not contain bits to clear Clearing a range's bits is different with setting them, since we don't need to touch them when states do not contain bits we want. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8d6f55fbd28..fe14285b53f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -513,6 +513,15 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + if (state->end < end && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) + goto next; + /* * | ---- desired range ---- | * | state | or @@ -565,12 +574,8 @@ hit_next: goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - set |= clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; -- cgit v1.2.3 From 9d47c7671dc555e198c7347a173ed37316e0c4c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:38 +0800 Subject: Btrfs: kick out redundant stuff in convert_extent_bit clear_state_bit will do merge_state for us, so kick out the redundant one. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe14285b53f..37259ff5cd7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -966,8 +966,6 @@ hit_next: set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -1012,7 +1010,6 @@ hit_next: if (state->end <= end) { set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1073,8 +1070,6 @@ hit_next: set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } -- cgit v1.2.3 From d9b0218f6cb682aa6a4ada2bfc5a25fdf3018563 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:39 +0800 Subject: Btrfs: fix a bug on overcommit stuff When overcommitting, we should check the sum of pinned space and bytes for delayed item. Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60bfe2d6854..dc083f55bcf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3611,12 +3611,15 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; + spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size < bytes) { + if (space_info->bytes_pinned + delayed_rsv->size < bytes) { spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); -- cgit v1.2.3 From 692e5759a43b916f0b66bcb39b2957499992381e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:36 +0800 Subject: Btrfs: be less strict on finding next node in clear_extent_bit In clear_extent_bit, it is enough that next node is adjacent in tree level. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37259ff5cd7..45ca8f9b0d6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -582,8 +582,7 @@ next: if (start <= end && next_node) { state = rb_entry(next_node, struct extent_state, rb_node); - if (state->start == start) - goto hit_next; + goto hit_next; } goto search_again; -- cgit v1.2.3 From fe66a05a06795bd3b788404d69ea7709f46a1609 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 08:40:56 -0500 Subject: Btrfs: improve error handling for btrfs_insert_dir_item callers This allows us to gracefully continue if we aren't able to insert directory items, both for normal files/dirs and snapshots. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 20 +++++++++++++++++++- fs/btrfs/transaction.c | 13 +++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6e0ee9b0d74..cbeb2e36ceb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4585,7 +4585,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - BUG_ON(ret); + if (ret) + goto fail_dir_item; btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); @@ -4593,6 +4594,23 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, parent_inode); } return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); + } + return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 287a6728b1a..016977beee5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -915,7 +915,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - BUG_ON(ret); + if (ret) { + pending->error = -EEXIST; + dput(parent); + goto fail; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); @@ -993,12 +997,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - int ret; - list_for_each_entry(pending, head, list) { - ret = create_pending_snapshot(trans, fs_info, pending); - BUG_ON(ret); - } + list_for_each_entry(pending, head, list) + create_pending_snapshot(trans, fs_info, pending); return 0; } -- cgit v1.2.3 From a6b0d5c8dbfd428717fc4db4c36757783f391c7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 20:53:43 -0500 Subject: Btrfs: make sure we update latest_bdev When we are setting up the mount, we close all the devices that were not actually part of the metadata we found. But, we don't make sure that one of those devices wasn't fs_devices->latest_bdev, which means we can do a use after free on the one we closed. This updates latest_bdev as it goes. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/volumes.c | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 58d0678dfcb..b801d29f3f1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2305,6 +2305,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices); + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd040bf3fd8..7eefdef8a14 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -459,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!latest_transid || + device->generation > latest_transid) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } if (device->bdev) { blkdev_put(device->bdev, device->mode); @@ -487,6 +498,10 @@ again: goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); return 0; } -- cgit v1.2.3 From 16780cabb877dbd0c8c5e9ff9bdebd6c5bdd1a7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 22:14:55 -0500 Subject: Btrfs: add extra sanity checks on the path names in btrfs_mksubvol Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e9bdb8b783e..05446f77f99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1333,6 +1333,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, goto out; } + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out; + } + if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly); -- cgit v1.2.3 From 506531905296d6aee84480c879b25ea98c3f9db6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 22 Feb 2012 12:36:24 -0500 Subject: Btrfs: clear the extent uptodate bits during parent transid failures If btrfs reads a block and finds a parent transid mismatch, it clears the uptodate flags on the extent buffer, and the pages inside it. But we only clear the uptodate bits in the state tree if the block straddles more than one page. This is from an old optimization from to reduce contention on the extent state tree. But it is buggy because the code that retries a read from a different copy of the block is going to find the uptodate state bits set and skip the IO. The end result of the bug is that we'll never actually read the good copy (if there is one). The fix here is to always clear the uptodate state bits, which is safe because this code is only called when the parent transid fails. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 45ca8f9b0d6..a55fbe6252d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3871,10 +3871,9 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) -- cgit v1.2.3 From 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Feb 2012 10:49:04 -0500 Subject: Btrfs: increase the global block reserve estimates When doing IO with large amounts of data fragmentation, the global block reserve calulations are too low. This increases them to avoid ENOSPC crashes. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc083f55bcf..079e5a1c343 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4108,7 +4108,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div64_u64(meta_used, 3) * 2; return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } -- cgit v1.2.3 From e77266e4c4be6f9dc91bf688bce015a8babd5fe0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Feb 2012 10:39:05 -0500 Subject: Btrfs: fix compiler warnings on 32 bit systems The enospc tracing code added some interesting uses of u64 pointer casts. Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++---------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/transaction.c | 3 ++- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b669a7d8e49..d986824bb2b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -644,7 +644,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - int ret; + int ret = 0; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 079e5a1c343..37e0a800d34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3312,7 +3312,8 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); + (u64)(unsigned long)data_sinfo, + bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3333,7 +3334,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); + (u64)(unsigned long)data_sinfo, + bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3698,9 +3700,9 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3769,9 +3771,9 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3916,8 +3918,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4135,14 +4137,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); + (u64)(unsigned long)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); + (u64)(unsigned long)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4195,7 +4197,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; @@ -4713,9 +4716,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + "space_info", + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 213ffa86ce1..ee15d88b33d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,7 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); @@ -500,7 +501,8 @@ again: out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 016977beee5..04b77e3ceb7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -327,7 +327,8 @@ again: if (num_bytes) { trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)h, num_bytes, 1); + (u64)(unsigned long)h, + num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } -- cgit v1.2.3 From d3b010640e5c59b98d3b11229ba4cc2838dc7cbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 3 Mar 2012 07:41:15 -0500 Subject: btrfs: fix locking issues in find_parent_nodes() - We might unlock head->mutex while it was not locked - We might leave the function without unlocking delayed_refs->lock Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 98f6bf10bbd..0436c12da8c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -583,7 +583,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key info_key = { 0 }; struct btrfs_delayed_ref_root *delayed_refs = NULL; - struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; struct list_head prefs_delayed; @@ -607,6 +607,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, * at a specified point in time */ again: + head = NULL; + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -635,8 +637,10 @@ again: goto again; } ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) + if (ret) { + spin_unlock(&delayed_refs->lock); goto out; + } } spin_unlock(&delayed_refs->lock); -- cgit v1.2.3 From a175423c831ea582c06784d1e172d2ce1d79923a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 28 Feb 2012 12:42:44 -0500 Subject: Btrfs: fix casting error in scrub reada code The reada code from scrub was casting down a u64 to an unsigned long so it could insert it into a radix tree. What it really wanted to do was cast down the result of a shift, instead of casting down the u64. The bug resulted in trying to insert our reada struct into the wrong place, which caused soft lockups and other problems. Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 2373b39a132..22db04550f6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -305,7 +305,7 @@ again: spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); spin_unlock(&fs_info->reada_lock); -- cgit v1.2.3 From 7ac687d9e047b3fa335f04e18c7188db6a170334 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:28 +0800 Subject: btrfs: remove the second argument of k[un]map_atomic() Signed-off-by: Cong Wang --- fs/btrfs/compression.c | 12 ++++++------ fs/btrfs/extent_io.c | 16 ++++++++-------- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/inode.c | 26 +++++++++++++------------- fs/btrfs/lzo.c | 4 ++-- fs/btrfs/scrub.c | 8 ++++---- fs/btrfs/zlib.c | 4 ++-- 7 files changed, 37 insertions(+), 37 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d02c27cd14c..b805afb37fa 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode, page = cb->compressed_pages[i]; csum = ~(u32)0; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (csum != *cb_sum) { printk(KERN_INFO "btrfs csum failed ino %llu " @@ -521,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (zero_offset) { int zeros; zeros = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } @@ -993,9 +993,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(PAGE_CACHE_SIZE - *pg_offset, PAGE_CACHE_SIZE - buf_offset); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); + kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); flush_dcache_page(page_out); *pg_offset += bytes; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a55fbe6252d..2862454bcdb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2546,10 +2546,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (zero_offset) { iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } while (cur <= end) { @@ -2558,10 +2558,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct extent_state *cached = NULL; iosize = PAGE_CACHE_SIZE - pg_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, @@ -2607,10 +2607,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree, char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -2756,10 +2756,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (page->index == end_index) { char *userpage; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); flush_dcache_page(page); } pg_offset = 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c7fb3a4247d..078b4fd5450 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -447,13 +447,13 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->bytenr = ordered->start; } - data = kmap_atomic(bvec->bv_page, KM_USER0); + data = kmap_atomic(bvec->bv_page); sector_sum->sum = ~(u32)0; sector_sum->sum = btrfs_csum_data(root, data + bvec->bv_offset, sector_sum->sum, bvec->bv_len); - kunmap_atomic(data, KM_USER0); + kunmap_atomic(data); btrfs_csum_final(sector_sum->sum, (char *)§or_sum->sum); sector_sum->bytenr = disk_bytenr; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 892b34785cc..3a0b5c1f9d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -173,9 +173,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_CACHE_SIZE); - kaddr = kmap_atomic(cpage, KM_USER0); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -187,10 +187,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); offset = start & (PAGE_CACHE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); page_cache_release(page); } btrfs_mark_buffer_dirty(leaf); @@ -422,10 +422,10 @@ again: * sending it down to disk */ if (offset) { - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } will_compress = 1; } @@ -1873,7 +1873,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } else { ret = get_state_private(io_tree, start, &private); } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (ret) goto zeroit; @@ -1882,7 +1882,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (csum != private) goto zeroit; - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); good: return 0; @@ -1894,7 +1894,7 @@ zeroit: (unsigned long long)private); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (private == 0) return 0; return -EIO; @@ -4937,12 +4937,12 @@ static noinline int uncompress_inline(struct btrfs_path *path, ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); if (ret) { - char *kaddr = kmap_atomic(page, KM_USER0); + char *kaddr = kmap_atomic(page); unsigned long copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, max_size - extent_offset); memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } kfree(tmp); return 0; @@ -5719,11 +5719,11 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) unsigned long flags; local_irq_save(flags); - kaddr = kmap_atomic(page, KM_IRQ0); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, csum, bvec->bv_len); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_IRQ0); + kunmap_atomic(kaddr); local_irq_restore(flags); flush_dcache_page(bvec->bv_page); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a178f5ebea7..743b86fa4fc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -411,9 +411,9 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = kmap_atomic(dest_page, KM_USER0); + kaddr = kmap_atomic(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); out: return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index abc0fbffa51..390e7102b0f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -591,7 +591,7 @@ static int scrub_fixup_check(struct scrub_bio *sbio, int ix) u64 flags = sbio->spag[ix].flags; page = sbio->bio->bi_io_vec[ix].bv_page; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); if (flags & BTRFS_EXTENT_FLAG_DATA) { ret = scrub_checksum_data(sbio->sdev, sbio->spag + ix, buffer); @@ -603,7 +603,7 @@ static int scrub_fixup_check(struct scrub_bio *sbio, int ix) } else { WARN_ON(1); } - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); return ret; } @@ -792,7 +792,7 @@ static void scrub_checksum(struct btrfs_work *work) } for (i = 0; i < sbio->count; ++i) { page = sbio->bio->bi_io_vec[i].bv_page; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); flags = sbio->spag[i].flags; logical = sbio->logical + i * PAGE_SIZE; ret = 0; @@ -807,7 +807,7 @@ static void scrub_checksum(struct btrfs_work *work) } else { WARN_ON(1); } - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); if (ret) { ret = scrub_recheck_error(sbio, i); if (!ret) { diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index faccd47c6c4..92c20654cc5 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -370,9 +370,9 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, PAGE_CACHE_SIZE - buf_offset); bytes = min(bytes, bytes_left); - kaddr = kmap_atomic(dest_page, KM_USER0); + kaddr = kmap_atomic(dest_page); memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); pg_offset += bytes; bytes_left -= bytes; -- cgit v1.2.3 From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- fs/btrfs/super.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ce97b217cb..81df3fec6a6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -629,7 +629,6 @@ static int btrfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root_dentry; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -660,15 +659,12 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - root_dentry = d_alloc_root(inode); - if (!root_dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail_close; } - sb->s_root = root_dentry; - save_mount_options(sb, data); cleancache_init_fs(sb); sb->s_flags |= MS_ACTIVE; -- cgit v1.2.3 From 8c3429300181be44b30f9f017d53dc717da56caa Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:31 -0400 Subject: btrfs: Add btrfs_panic() As part of the effort to eliminate BUG_ON as an error handling technique, we need to determine which errors are actual logic errors, which are on-disk corruption, and which are normal runtime errors e.g. -ENOMEM. Annotating these error cases is helpful to understand and report them. This patch adds a btrfs_panic() routine that will either panic or BUG depending on the new -ofatal_errors={panic,bug} mount option. Since there are still so many BUG_ONs, it defaults to BUG for now but I expect that to change once the error handling effort has made significant progress. Signed-off-by: Jeff Mahoney --- fs/btrfs/ctree.h | 11 +++++++++++ fs/btrfs/super.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80b6486fd5e..a97a6708975 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1503,6 +1503,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) +#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2970,6 +2971,16 @@ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ } while (0) +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + struct btrfs_fs_info *_i = (fs_info); \ + __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ + BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ +} while (0) + /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ce97b217cb..9774e38a053 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -76,6 +76,9 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, case -EROFS: errstr = "Readonly filesystem"; break; + case -EEXIST: + errstr = "Object already exists"; + break; default: if (nbuf) { if (snprintf(nbuf, 16, "error %d", -errno) >= 0) @@ -145,6 +148,36 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, + * issues an alert, and either panics or BUGs, depending on mount options. + */ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char nbuf[16]; + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + + printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", + s_id, function, line, &vaf, errstr); + va_end(args); + /* Caller calls BUG() */ +} + static void btrfs_put_super(struct super_block *sb) { (void)close_ctree(btrfs_sb(sb)->tree_root); @@ -166,7 +199,7 @@ enum { Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_err, }; @@ -206,6 +239,7 @@ static match_table_t tokens = { {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_err, NULL}, }; @@ -438,6 +472,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) ret = -EINVAL; goto out; #endif + case Opt_fatal_errors: + if (strcmp(args[0].from, "panic") == 0) + btrfs_set_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else if (strcmp(args[0].from, "bug") == 0) + btrfs_clear_opt(info->mount_opt, + PANIC_ON_FATAL_ERROR); + else { + ret = -EINVAL; + goto out; + } + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -766,6 +812,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + seq_puts(seq, ",fatal_errors=panic"); return 0; } -- cgit v1.2.3 From c2d904e086b6f707b73bf065e4d18ded4b86ae9e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:32 -0400 Subject: btrfs: Catch locking failures in {set,clear,convert}_extent_bit The *_state functions can only return 0 or -EEXIST. This patch addresses the cases where those functions returning -EEXIST represent a locking failure. It handles them by panicking with an appropriate error message. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 58 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a55fbe6252d..601f23bddea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -53,6 +53,12 @@ struct extent_page_data { unsigned int sync_io:1; }; +static inline struct btrfs_fs_info * +tree_fs_info(struct extent_io_tree *tree) +{ + return btrfs_sb(tree->mapping->host->i_sb); +} + int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -439,6 +445,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } +void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree_fs_info(tree), err, "Locking error: " + "Extent tree was modified by another " + "thread while locked."); +} + /* * clear some bits on a range in the tree. This may require splitting * or inserting elements in the tree, so the gfp mask is used to @@ -542,7 +555,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -564,7 +579,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + if (wake) wake_up(&state->wq); @@ -742,8 +759,10 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, &bits); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; - BUG_ON(err == -EEXIST); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -809,7 +828,9 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; if (err) goto out; @@ -846,12 +867,9 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; @@ -873,7 +891,8 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); @@ -946,7 +965,8 @@ again: } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); goto out; } state = rb_entry(node, struct extent_state, rb_node); @@ -1002,7 +1022,8 @@ hit_next: goto out; } err = split_state(tree, state, prealloc, start); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; if (err) goto out; @@ -1041,12 +1062,8 @@ hit_next: */ err = insert_state(tree, prealloc, start, this_end, &bits); - BUG_ON(err == -EEXIST); - if (err) { - free_extent_state(prealloc); - prealloc = NULL; - goto out; - } + if (err) + extent_io_tree_panic(tree, err); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1065,7 +1082,8 @@ hit_next: } err = split_state(tree, state, prealloc, end + 1); - BUG_ON(err == -EEXIST); + if (err) + extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); -- cgit v1.2.3 From 43c04fb1b8c9f45d971bb53d7cbbcda8ee85716b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:33 -0400 Subject: btrfs: Panic on bad rbtree operations The ordered data and relocation trees have BUG_ONs to protect against bad tree operations. This patch replaces them with a panic that will report the problem. Signed-off-by: Jeff Mahoney --- fs/btrfs/ordered-data.c | 12 ++++++++++-- fs/btrfs/relocation.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a1c94042530..2857f28e20e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -59,6 +59,14 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +static void ordered_data_tree_panic(struct inode *inode, int errno, + u64 offset) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " + "%llu\n", (unsigned long long)offset); +} + /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -207,7 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); - BUG_ON(node); + if (node) + ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); @@ -215,7 +224,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &BTRFS_I(inode)->root->fs_info->ordered_extents); spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - BUG_ON(node); return 0; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8c1aae2c845..e5996ff8aaa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -326,6 +326,19 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } +void backref_tree_panic(struct rb_node *rb_node, int errno, + u64 bytenr) +{ + + struct btrfs_fs_info *fs_info = NULL; + struct backref_node *bnode = rb_entry(rb_node, struct backref_node, + rb_node); + if (bnode->root) + fs_info = bnode->root->fs_info; + btrfs_panic(fs_info, errno, "Inconsistency in backref cache " + "found at offset %llu\n", (unsigned long long)bytenr); +} + /* * walk up backref nodes until reach node presents tree root */ @@ -452,7 +465,8 @@ static void update_backref_node(struct backref_cache *cache, rb_erase(&node->rb_node, &cache->rb_root); node->bytenr = bytenr; rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, bytenr); } /* @@ -999,7 +1013,8 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); list_add_tail(&node->lower, &cache->leaves); } @@ -1034,7 +1049,9 @@ next: if (!cowonly) { rb_node = tree_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + upper->bytenr); } list_add_tail(&edge->list[UPPER], &upper->lower); @@ -1180,7 +1197,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, rb_node = tree_insert(&cache->rb_root, new_node->bytenr, &new_node->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); if (!new_node->lowest) { list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { @@ -1252,7 +1270,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); } else { list_del_init(&root->root_list); kfree(node); @@ -3154,7 +3173,8 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = 0; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, block->bytenr); return 0; } @@ -3426,7 +3446,9 @@ static int find_data_references(struct reloc_control *rc, block->key_ready = 1; rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, + block->bytenr); } if (counted) added = 1; -- cgit v1.2.3 From cddcd800189bc03745d576f913dc57692c6f439a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:23:23 -0400 Subject: btrfs: Fix kfree of member instead of structure Correctness fix: The kfree calls in the add_delayed_* functions free the node that's passed into it, but the node is a member of another structure. It works because it's always the first member of the containing structure, but it should really be using the containing structure itself. Signed-off-by: Jeff Mahoney --- fs/btrfs/delayed-ref.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 66e4f29505a..29ecd543d80 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -487,7 +487,7 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -549,7 +549,7 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -611,7 +611,7 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(ref); + kfree(full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; -- cgit v1.2.3 From d16cb050e5b1c3a9d754fed7098eefb8237877d1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:34 -0400 Subject: btrfs: Simplify btrfs_insert_root btrfs_insert_root is just a wrapper for btrfs_insert_item. Just return the error directly. Signed-off-by: Jeff Mahoney --- fs/btrfs/root-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f4099904565..1fd93d63707 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -116,13 +116,10 @@ out: return ret; } -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item) +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item) { - int ret; - ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); - return ret; + return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } /* -- cgit v1.2.3 From d5c13f927fe77b11a67f79559808c68b26474c77 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:56:27 +0100 Subject: btrfs: clean_tree_block should panic on observed memory corruption and return void The only error condition in clean_tree_block is an accounting bug. Returning without modifying dirty_metadata_bytes and as if the cleaning as been performed may cause problems later so it should panic instead. It should probably be a BUG_ON but we have btrfs_panic now. Signed-off-by: Jeff Mahoney --- fs/btrfs/disk-io.c | 15 ++++++++++----- fs/btrfs/disk-io.h | 4 ++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 534266fe505..d52ec115520 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1109,8 +1109,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, } -int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) +void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) { struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == @@ -1121,8 +1121,14 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_lock(&root->fs_info->delalloc_lock); if (root->fs_info->dirty_metadata_bytes >= buf->len) root->fs_info->dirty_metadata_bytes -= buf->len; - else - WARN_ON(1); + else { + spin_unlock(&root->fs_info->delalloc_lock); + btrfs_panic(root->fs_info, -EOVERFLOW, + "Can't clear %lu bytes from " + " dirty_mdatadata_bytes (%lu)", + buf->len, + root->fs_info->dirty_metadata_bytes); + } spin_unlock(&root->fs_info->delalloc_lock); } @@ -1131,7 +1137,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } - return 0; } static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e4bc4741319..ca34237ae38 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -44,8 +44,8 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -- cgit v1.2.3 From 538042801a479ebd316582ad10a9c3156b5b7548 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:56:28 +0100 Subject: btrfs: avoid NULL deref in btrfs_reserve_extent with DEBUG_ENOSPC __find_space_info can return NULL but we don't check it before calling dump_space_info(). Signed-off-by: Jeff Mahoney --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 37e0a800d34..c32a9bffe9e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5838,7 +5838,8 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (sinfo) + dump_space_info(sinfo, num_bytes, 1); } } -- cgit v1.2.3 From 6763af84a69f23c1c79f00720982e74fcff4d1f3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:56:29 +0100 Subject: btrfs: Remove set bits return from clear_extent_bit There is only one caller of clear_extent_bit that checks the return value and it only checks if it's negative. Since there are no users of the returned bits functionality of clear_extent_bit, stop returning it and avoid complicating error handling. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 601f23bddea..a4a7a18cdd3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -462,8 +462,7 @@ void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * the range [start, end] is inclusive. * - * This takes the tree lock, and returns < 0 on error, > 0 if any of the - * bits were already set, or zero if none of the bits were already set. + * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int wake, int delete, @@ -477,7 +476,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct rb_node *node; u64 last_end; int err; - int set = 0; int clear = 0; if (delete) @@ -562,7 +560,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -585,13 +583,13 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; } - set |= clear_state_bit(tree, state, &bits, wake); + clear_state_bit(tree, state, &bits, wake); next: if (last_end == (u64)-1) goto out; @@ -608,7 +606,7 @@ out: if (prealloc) free_extent_state(prealloc); - return set; + return 0; search_again: if (start > end) -- cgit v1.2.3 From 200a5c17677e1ee8b78382046f3748c9f5816281 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:43 -0400 Subject: btrfs: find_and_setup_root error push-up find_and_setup_root BUGs when it encounters an error from btrfs_find_last_root, which can occur if a path can't be allocated. This patch pushes it up to its callers where it is already handled. Signed-off-by: Jeff Mahoney --- fs/btrfs/disk-io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d52ec115520..155c4e35e53 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1197,10 +1197,10 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, return 0; } -static int find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) +static int __must_check find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) { int ret; u32 blocksize; @@ -1213,7 +1213,8 @@ static int find_and_setup_root(struct btrfs_root *tree_root, &root->root_item, &root->root_key); if (ret > 0) return -ENOENT; - BUG_ON(ret); + else if (ret < 0) + return ret; generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); -- cgit v1.2.3 From b45a9d8b48e5ce534bd222007c43cbf374544f0b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:44 -0400 Subject: btrfs: btrfs_update_root error push-up btrfs_update_root BUG's when it can't alloc a path, yet it can recover from a search error. This patch returns -ENOMEM instead. Signed-off-by: Jeff Mahoney --- fs/btrfs/ctree.h | 7 ++++--- fs/btrfs/root-tree.c | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a97a6708975..339e637ab27 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2724,9 +2724,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1fd93d63707..1486cf9de1d 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -93,7 +93,9 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) goto out; -- cgit v1.2.3 From 0417341e6bd93e2a2ceac0e57409706803b335e5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:23:12 -0400 Subject: btrfs: Simplify btrfs_submit_bio_hook btrfs_submit_bio_hook currently calls btrfs_bio_wq_end_io in either case of an if statement that determines one of the arguments. This patch moves the function call outside of the if statement and uses it to only determine the different argument. This allows us to catch an error in one place in a more visually obvious way. Signed-off-by: Jeff Mahoney --- fs/btrfs/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 892b34785cc..f0dfd7de020 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1479,13 +1479,14 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; int skip_sum; + int metadata = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(root, inode)) - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); - else - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + metadata = 2; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); BUG_ON(ret); if (!(rw & REQ_WRITE)) { -- cgit v1.2.3 From 3444a97255de907f32562741fb6d104620b9fce3 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:23:13 -0400 Subject: btrfs: Factor out tree->ops->merge_bio_hook call In submit_extent_page, there's a visually noisy if statement that, in the midst of other conditions, does the tree dependency for tree->ops and tree->ops->merge_bio_hook before calling it, and then another condition afterwards. If an error is returned from merge_bio_hook, there's no way to catch it. It's considered a routine "1" return value instead of a failure. This patch factors out the dependency check into a new local merge_bio routine and BUG's on an error. The if statement is less noisy as a side- effect. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 17 ++++++++++++++--- fs/btrfs/inode.c | 5 +++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a4a7a18cdd3..c342e923ea4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2429,6 +2429,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, return ret; } +static int merge_bio(struct extent_io_tree *tree, struct page *page, + unsigned long offset, size_t size, struct bio *bio, + unsigned long bio_flags) +{ + int ret = 0; + if (tree->ops && tree->ops->merge_bio_hook) + ret = tree->ops->merge_bio_hook(page, offset, size, bio, + bio_flags); + BUG_ON(ret < 0); + return ret; + +} + static int submit_extent_page(int rw, struct extent_io_tree *tree, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2457,9 +2470,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, page_size, bio, - bio_flags)) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0dfd7de020..ad1e88d2503 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1425,10 +1425,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(map_tree, READ, logical, &map_length, NULL, 0); - + /* Will always return 0 or 1 with map_multi == NULL */ + BUG_ON(ret < 0); if (map_length < length + size) return 1; - return ret; + return 0; } /* -- cgit v1.2.3 From 355808c296c6923db6705f43639969a80b16d15d Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:23:14 -0400 Subject: btrfs: ->submit_bio_hook error push-up This pushes failures from the submit_bio_hook callbacks, btrfs_submit_bio_hook and btree_submit_bio_hook into the callers, including callers of submit_one_bio where it catches the failures with BUG_ON. It also pushes up through the ->readpage_io_failed_hook to end_bio_extent_writepage where the error is already caught with BUG_ON. Signed-off-by: Jeff Mahoney --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent_io.c | 37 ++++++++++++++++++++++++++----------- fs/btrfs/inode.c | 3 ++- 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 155c4e35e53..2e4428bd60e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -847,9 +847,9 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); + if (ret) + return ret; if (!(rw & REQ_WRITE)) { /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c342e923ea4..8368baa1f37 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2199,6 +2199,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) /* Writeback already completed */ if (ret == 0) return 1; + BUG_ON(ret < 0); } if (!uptodate) { @@ -2351,6 +2352,7 @@ error_handled: if (ret == 0) goto error_handled; } + BUG_ON(ret < 0); } if (uptodate) { @@ -2402,8 +2404,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) +static int __must_check submit_one_bio(int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2474,6 +2476,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); + BUG_ON(ret < 0); bio = NULL; } else { return 0; @@ -2494,8 +2497,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; - else + else { ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + BUG_ON(ret < 0); + } return ret; } @@ -2707,8 +2712,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); - if (bio) + if (bio) { ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + BUG_ON(ret < 0); + } return ret; } @@ -3126,10 +3133,14 @@ retry: static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { + int rw = WRITE; + int ret; + if (epd->sync_io) - submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); - else - submit_one_bio(WRITE, epd->bio, 0, 0); + rw = WRITE_SYNC; + + ret = submit_one_bio(rw, epd->bio, 0, 0); + BUG_ON(ret < 0); epd->bio = NULL; } } @@ -3245,8 +3256,10 @@ int extent_readpages(struct extent_io_tree *tree, page_cache_release(page); } BUG_ON(!list_empty(pages)); - if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + if (bio) { + int ret = submit_one_bio(READ, bio, 0, bio_flags); + BUG_ON(ret < 0); + } return 0; } @@ -4075,8 +4088,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } } - if (bio) - submit_one_bio(READ, bio, mirror_num, bio_flags); + if (bio) { + err = submit_one_bio(READ, bio, mirror_num, bio_flags); + BUG_ON(err < 0); + } if (ret || wait != WAIT_COMPLETE) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad1e88d2503..5394bdb314c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1488,7 +1488,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, metadata = 2; ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); - BUG_ON(ret); + if (ret) + return ret; if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { -- cgit v1.2.3 From ffd7b33944f4573a063af7a55f8a5199c8185665 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:23:15 -0400 Subject: btrfs: __add_reloc_root error push-up This patch pushes kmalloc errors up to the caller and BUGs in the caller. The BUG_ON for duplicate reloc tree root insertion is replaced with a panic explaining the issue. Signed-off-by: Jeff Mahoney --- fs/btrfs/relocation.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e5996ff8aaa..974b0dfeffa 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1221,14 +1221,15 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __add_reloc_root(struct btrfs_root *root) +static int __must_check __add_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node; struct reloc_control *rc = root->fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); - BUG_ON(!node); + if (!node) + return -ENOMEM; node->bytenr = root->node->start; node->data = root; @@ -1237,7 +1238,12 @@ static int __add_reloc_root(struct btrfs_root *root) rb_node = tree_insert(&rc->reloc_root_tree.rb_root, node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); - BUG_ON(rb_node); + if (rb_node) { + kfree(node); + btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " + "for start=%llu while inserting into relocation " + "tree\n"); + } list_add_tail(&root->root_list, &rc->reloc_roots); return 0; @@ -1353,6 +1359,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root; struct reloc_control *rc = root->fs_info->reloc_ctl; int clear_rsv = 0; + int ret; if (root->reloc_root) { reloc_root = root->reloc_root; @@ -1372,7 +1379,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, if (clear_rsv) trans->block_rsv = NULL; - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); root->reloc_root = reloc_root; return 0; } @@ -4226,7 +4234,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) reloc_root->root_key.offset); BUG_ON(IS_ERR(fs_root)); - __add_reloc_root(reloc_root); + err = __add_reloc_root(reloc_root); + BUG_ON(err < 0); fs_root->reloc_root = reloc_root; } @@ -4428,7 +4437,8 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, reloc_root = create_reloc_root(trans, root->reloc_root, new_root->root_key.objectid); - __add_reloc_root(reloc_root); + ret = __add_reloc_root(reloc_root); + BUG_ON(ret < 0); new_root->reloc_root = reloc_root; if (rc->create_reloc_tree) { -- cgit v1.2.3 From 143bede527b054a271053f41bfaca2b57baa9408 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:56:26 +0100 Subject: btrfs: return void in functions without error conditions Signed-off-by: Jeff Mahoney --- fs/btrfs/async-thread.c | 15 ++-- fs/btrfs/async-thread.h | 4 +- fs/btrfs/compression.c | 8 +- fs/btrfs/compression.h | 2 +- fs/btrfs/ctree.c | 200 ++++++++++++++++------------------------------- fs/btrfs/ctree.h | 45 +++++------ fs/btrfs/delayed-inode.c | 6 +- fs/btrfs/delayed-ref.c | 27 ++----- fs/btrfs/dir-item.c | 9 +-- fs/btrfs/disk-io.c | 39 ++++----- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent-tree.c | 60 +++++++------- fs/btrfs/extent_io.c | 35 ++++----- fs/btrfs/extent_io.h | 4 +- fs/btrfs/file-item.c | 25 +++--- fs/btrfs/inode-item.c | 4 +- fs/btrfs/inode.c | 7 +- fs/btrfs/locking.c | 6 +- fs/btrfs/locking.h | 4 +- fs/btrfs/ordered-data.c | 48 ++++-------- fs/btrfs/ordered-data.h | 24 +++--- fs/btrfs/scrub.c | 36 +++------ fs/btrfs/super.c | 5 +- fs/btrfs/transaction.c | 3 +- fs/btrfs/tree-log.c | 17 ++-- fs/btrfs/tree-log.h | 2 +- fs/btrfs/volumes.c | 18 ++--- fs/btrfs/volumes.h | 4 +- 28 files changed, 249 insertions(+), 410 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0cc20b35c1c..42704149b72 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -171,11 +171,11 @@ out: spin_unlock_irqrestore(&workers->lock, flags); } -static noinline int run_ordered_completions(struct btrfs_workers *workers, +static noinline void run_ordered_completions(struct btrfs_workers *workers, struct btrfs_work *work) { if (!workers->ordered) - return 0; + return; set_bit(WORK_DONE_BIT, &work->flags); @@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, } spin_unlock(&workers->order_lock); - return 0; } static void put_worker(struct btrfs_worker_thread *worker) @@ -399,7 +398,7 @@ again: /* * this will wait for all the worker threads to shutdown */ -int btrfs_stop_workers(struct btrfs_workers *workers) +void btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; @@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers) put_worker(worker); } spin_unlock_irq(&workers->lock); - return 0; } /* @@ -615,14 +613,14 @@ found: * it was taken from. It is intended for use with long running work functions * that make some progress and want to give the cpu up for others. */ -int btrfs_requeue_work(struct btrfs_work *work) +void btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; spin_lock_irqsave(&worker->lock, flags); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) @@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); -out: - - return 0; } void btrfs_set_work_high_prio(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index f34cc31fa3c..063698b90ce 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -111,9 +111,9 @@ struct btrfs_workers { void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers); -int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); -int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d02c27cd14c..0ed1ed22775 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -226,8 +226,8 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline int end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; @@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start, index += ret; } /* the inode may be gone now */ - return 0; } /* @@ -734,7 +733,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_lzo_compress, }; -int __init btrfs_init_compress(void) +void __init btrfs_init_compress(void) { int i; @@ -744,7 +743,6 @@ int __init btrfs_init_compress(void) atomic_set(&comp_alloc_workspace[i], 0); init_waitqueue_head(&comp_workspace_wait[i]); } - return 0; } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a12059f4f0f..9afb0a62ae8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_init_compress(void); +void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0639a555e16..9d472c2a8de 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -36,7 +36,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); struct btrfs_path *btrfs_alloc_path(void) @@ -1010,10 +1010,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - wret = del_ptr(trans, root, path, level + 1, pslot + - 1); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); free_extent_buffer(right); @@ -1051,9 +1048,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - wret = del_ptr(trans, root, path, level + 1, pslot); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); free_extent_buffer(mid); @@ -1881,15 +1876,12 @@ done: * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels * - * If this fails to write a tree block, it returns -1, but continues - * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; - int ret = 0; struct extent_buffer *t; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1902,7 +1894,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, if (tslot != 0) break; } - return ret; } /* @@ -1911,9 +1902,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -1923,13 +1914,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (comp_keys(&disk_key, new_key) >= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) >= 0); } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (comp_keys(&disk_key, new_key) <= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) <= 0); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -1937,7 +1926,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(eb); if (slot == 0) fixup_low_keys(trans, root, path, &disk_key, 1); - return 0; } /* @@ -2140,12 +2128,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. - * - * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, u64 bytenr, int slot, int level) +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -2155,8 +2142,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) - BUG(); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), @@ -2169,7 +2155,6 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); - return 0; } /* @@ -2190,7 +2175,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; int mid; int ret; - int wret; u32 c_nritems; c = path->nodes[level]; @@ -2247,11 +2231,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - wret = insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, - level + 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2537,7 +2518,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_nritems; u32 nr; int ret = 0; - int wret; u32 this_item_size; u32 old_left_item_size; @@ -2643,9 +2623,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -2738,21 +2716,17 @@ out: /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { int data_copy_size; int rt_data_off; int i; - int ret = 0; - int wret; struct btrfs_disk_key disk_key; nritems = nritems - mid; @@ -2780,12 +2754,9 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } btrfs_set_header_nritems(l, mid); - ret = 0; btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -2803,8 +2774,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); - - return ret; } /* @@ -2993,12 +2962,8 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3006,29 +2971,21 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, + insert_ptr(trans, root, path, &disk_key, right->start, path->slots[1], 1); - if (wret) - ret = wret; btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } + if (path->slots[1] == 0) + fixup_low_keys(trans, root, path, + &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; } - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); - BUG_ON(ret); + copy_for_split(trans, root, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -3036,7 +2993,7 @@ again: goto again; } - return ret; + return 0; push_for_double: push_for_double_split(trans, root, path, data_size); @@ -3238,11 +3195,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - ret = setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - BUG_ON(ret); - + setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3257,10 +3212,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -3277,7 +3232,7 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, old_size = btrfs_item_size_nr(leaf, slot); if (old_size == new_size) - return 0; + return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3350,15 +3305,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* * make the item pointed to by the path bigger, data_size is the new size. */ -int btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { int slot; struct extent_buffer *leaf; @@ -3416,7 +3370,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* @@ -3544,7 +3497,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -3562,17 +3515,16 @@ out: * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; struct extent_buffer *leaf; int slot; @@ -3633,10 +3585,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(leaf, nritems + nr); - ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -3645,7 +3596,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* @@ -3672,16 +3622,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret == 0) return -EEXIST; if (ret < 0) - goto out; + return ret; slot = path->slots[0]; BUG_ON(slot < 0); - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(trans, root, path, cpu_key, data_size, total_data, total_size, nr); - -out: - return ret; + return 0; } /* @@ -3717,13 +3665,11 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; - int ret = 0; - int wret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { @@ -3743,12 +3689,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); - return ret; } /* @@ -3761,17 +3704,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { - int ret; - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - ret = del_ptr(trans, root, path, 1, path->slots[1]); - if (ret) - return ret; + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3782,7 +3721,6 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); - return 0; } /* * delete the item at the leaf level in path. If that empties @@ -3839,8 +3777,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { btrfs_set_path_blocking(path); clean_tree_block(trans, root, leaf); - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -3848,10 +3785,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, - &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -3879,9 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); + ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 339e637ab27..30c5a247ab2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2485,8 +2485,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2549,8 +2549,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -int btrfs_set_block_group_rw(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); +void btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, @@ -2569,9 +2569,9 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -2591,12 +2591,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, u32 data_size); -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2630,10 +2631,10 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2911,7 +2912,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); int btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -3021,10 +3022,10 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, /* scrub.c */ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly); -int btrfs_scrub_pause(struct btrfs_root *root); -int btrfs_scrub_pause_super(struct btrfs_root *root); -int btrfs_scrub_continue(struct btrfs_root *root); -int btrfs_scrub_continue_super(struct btrfs_root *root); +void btrfs_scrub_pause(struct btrfs_root *root); +void btrfs_scrub_pause_super(struct btrfs_root *root); +void btrfs_scrub_continue(struct btrfs_root *root); +void btrfs_scrub_continue_super(struct btrfs_root *root); int btrfs_scrub_cancel(struct btrfs_root *root); int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fe4cd0f1cef..6829590d0fb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -836,10 +836,8 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ - ret = setup_items_for_insert(trans, root, path, keys, data_size, - total_data_size, total_size, nitems); - if (ret) - goto error; + setup_items_for_insert(trans, root, path, keys, data_size, + total_data_size, total_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 29ecd543d80..69f22e3ab3b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -420,7 +420,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, @@ -494,13 +494,12 @@ static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -554,13 +553,12 @@ static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, +static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, @@ -616,7 +614,6 @@ static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, delayed_refs->num_entries++; trans->delayed_ref_updates++; } - return 0; } /* @@ -634,7 +631,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -656,14 +652,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 0); - BUG_ON(ret); - ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -685,7 +679,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; BUG_ON(extent_op && !extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); @@ -707,14 +700,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, action, 1); - BUG_ON(ret); - ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - BUG_ON(ret); if (!need_ref_seq(for_cow, ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); @@ -729,7 +720,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, { struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - int ret; head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); if (!head_ref) @@ -740,10 +730,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - BUG_ON(ret); if (waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 31d84e78129..76743308bd9 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -49,9 +49,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - ret = btrfs_extend_item(trans, root, path, data_size); - } - if (ret < 0) + btrfs_extend_item(trans, root, path, data_size); + } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; @@ -383,8 +382,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - ret = btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2e4428bd60e..73ccadce90b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,12 +50,12 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static int btrfs_destroy_ordered_operations(struct btrfs_root *root); -static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); @@ -1139,10 +1139,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } -static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, - u64 objectid) +static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) { root->node = NULL; root->commit_root = NULL; @@ -1194,7 +1194,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; - return 0; } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, @@ -2897,7 +2896,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -2910,7 +2909,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); - return 0; } static void free_fs_root(struct btrfs_root *root) @@ -2927,7 +2925,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root); } -static int del_fs_roots(struct btrfs_fs_info *fs_info) +static void del_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2956,7 +2954,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) btrfs_free_fs_root(fs_info, gang[i]); } - return 0; } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3299,7 +3296,7 @@ int btrfs_error_commit_super(struct btrfs_root *root) return ret; } -static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3321,11 +3318,9 @@ static int btrfs_destroy_ordered_operations(struct btrfs_root *root) spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } -static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct list_head splice; struct btrfs_ordered_extent *ordered; @@ -3357,8 +3352,6 @@ static int btrfs_destroy_ordered_extents(struct btrfs_root *root) } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -3413,7 +3406,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3431,11 +3424,9 @@ static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) kfree(snapshot); } - - return 0; } -static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3455,8 +3446,6 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) } spin_unlock(&root->fs_info->delalloc_lock); - - return 0; } static int btrfs_destroy_marked_extents(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ca34237ae38..1608492d485 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -64,7 +64,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c32a9bffe9e..0daa1df1643 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1010,7 +1010,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, return ret; BUG_ON(ret); - ret = btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(trans, root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1592,13 +1592,13 @@ out: * helper to add new inline back ref */ static noinline_for_stack -int setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1608,7 +1608,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 refs; int size; int type; - int ret; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1617,7 +1616,7 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - ret = btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(trans, root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1652,7 +1651,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } btrfs_mark_buffer_dirty(leaf); - return 0; } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1687,12 +1685,12 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -int update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) +void update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1703,7 +1701,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, u32 item_size; int size; int type; - int ret; u64 refs; leaf = path->nodes[0]; @@ -1745,10 +1742,9 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - ret = btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(trans, root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); - return 0; } static noinline_for_stack @@ -1768,13 +1764,13 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - ret = update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); + update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - ret = setup_inline_extent_backref(trans, root, path, iref, - parent, root_objectid, - owner, offset, refs_to_add, - extent_op); + setup_inline_extent_backref(trans, root, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } return ret; } @@ -1804,12 +1800,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { - int ret; + int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - ret = update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); + update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); } else if (is_data) { ret = remove_extent_data_ref(trans, root, path, refs_to_drop); } else { @@ -4734,7 +4730,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, return ret; } -int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, +void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4764,7 +4760,6 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); update_global_block_rsv(fs_info); - return 0; } static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) @@ -7189,7 +7184,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -int btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -7205,7 +7200,6 @@ int btrfs_set_block_group_rw(struct btrfs_root *root, cache->ro = 0; spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); - return 0; } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8368baa1f37..65216fabacb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -142,6 +142,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) #endif atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -159,6 +160,7 @@ void free_extent_state(struct extent_state *state) list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif + trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -617,8 +619,8 @@ search_again: goto again; } -static int wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) __releases(tree->lock) __acquires(tree->lock) { @@ -628,7 +630,6 @@ static int wait_on_state(struct extent_io_tree *tree, schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - return 0; } /* @@ -636,7 +637,7 @@ static int wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) { struct extent_state *state; struct rb_node *node; @@ -673,7 +674,6 @@ again: } out: spin_unlock(&tree->lock); - return 0; } static void set_state_bits(struct extent_io_tree *tree, @@ -1359,9 +1359,9 @@ out: return found; } -static noinline int __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) +static noinline void __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) { int ret; struct page *pages[16]; @@ -1371,7 +1371,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode, int i; if (index == locked_page->index && end_index == index) - return 0; + return; while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -1386,7 +1386,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode, index += ret; cond_resched(); } - return 0; } static noinline int lock_delalloc_pages(struct inode *inode, @@ -1777,39 +1776,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, * helper function to set a given page up to date if all the * extents in the tree for that page are up to date */ -static int check_page_uptodate(struct extent_io_tree *tree, - struct page *page) +static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); - return 0; } /* * helper function to unlock a page if all the extents in the tree * for that page are unlocked */ -static int check_page_locked(struct extent_io_tree *tree, - struct page *page) +static void check_page_locked(struct extent_io_tree *tree, struct page *page) { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); - return 0; } /* * helper function to end page writeback if all the extents * in the tree for that page are done with writeback */ -static int check_page_writeback(struct extent_io_tree *tree, - struct page *page) +static void check_page_writeback(struct extent_io_tree *tree, + struct page *page) { end_page_writeback(page); - return 0; } /* @@ -3835,7 +3829,7 @@ void free_extent_buffer(struct extent_buffer *eb) WARN_ON(1); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, +void clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb) { unsigned long i; @@ -3867,7 +3861,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } - return 0; } int set_extent_buffer_dirty(struct extent_io_tree *tree, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cecc3518c12..f7f321ee4ba 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -287,8 +287,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_io_tree *tree, +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +void clear_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_io_tree *tree, struct extent_buffer *eb); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c7fb3a4247d..edb69b4d533 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -483,18 +483,17 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 bytenr, u64 len) +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) { struct extent_buffer *leaf; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; - int ret; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; @@ -510,7 +509,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 1); + btrfs_truncate_item(trans, root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -522,15 +521,13 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - ret = btrfs_truncate_item(trans, root, path, new_size, 0); + btrfs_truncate_item(trans, root, path, new_size, 0); key->offset = end_byte; - ret = btrfs_set_item_key_safe(trans, root, path, key); - BUG_ON(ret); + btrfs_set_item_key_safe(trans, root, path, key); } else { BUG(); } - return 0; } /* @@ -639,9 +636,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - ret = truncate_one_csum(trans, root, path, - &key, bytenr, len); - BUG_ON(ret); + truncate_one_csum(trans, root, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -772,7 +767,7 @@ again: if (diff != csum_size) goto insert; - ret = btrfs_extend_item(trans, root, path, diff); + btrfs_extend_item(trans, root, path, diff); goto csum; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index baa74f3db69..c0792409bb6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -128,7 +128,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - ret = btrfs_truncate_item(trans, root, path, + btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -165,7 +165,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ret = btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(trans, root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5394bdb314c..dc800eefdbc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3164,8 +3164,8 @@ search_again: } size = btrfs_file_extent_calc_inline_size(size); - ret = btrfs_truncate_item(trans, root, path, - size, 1); + btrfs_truncate_item(trans, root, path, + size, 1); } else if (root->ref_cows) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); @@ -3782,7 +3782,7 @@ static void inode_tree_del(struct inode *inode) } } -int btrfs_invalidate_inodes(struct btrfs_root *root) +void btrfs_invalidate_inodes(struct btrfs_root *root) { struct rb_node *node; struct rb_node *prev; @@ -3842,7 +3842,6 @@ again: node = rb_next(node); } spin_unlock(&root->inode_lock); - return 0; } static int btrfs_init_locked_inode(struct inode *inode, void *p) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5e178d8f716..272f911203f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -208,7 +208,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * take a spinning write lock. This will wait for both * blocking readers or writers */ -int btrfs_tree_lock(struct extent_buffer *eb) +void btrfs_tree_lock(struct extent_buffer *eb) { again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); @@ -230,13 +230,12 @@ again: atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); eb->lock_owner = current->pid; - return 0; } /* * drop a spinning or a blocking write lock. */ -int btrfs_tree_unlock(struct extent_buffer *eb) +void btrfs_tree_unlock(struct extent_buffer *eb) { int blockers = atomic_read(&eb->blocking_writers); @@ -255,7 +254,6 @@ int btrfs_tree_unlock(struct extent_buffer *eb) atomic_dec(&eb->spinning_writers); write_unlock(&eb->lock); } - return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 17247ddb81a..ca52681e5f4 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -24,8 +24,8 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 -int btrfs_tree_lock(struct extent_buffer *eb); -int btrfs_tree_unlock(struct extent_buffer *eb); +void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2857f28e20e..bbf6d0d9aeb 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -257,9 +257,9 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum) +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; @@ -267,7 +267,6 @@ int btrfs_add_ordered_sum(struct inode *inode, spin_lock(&tree->lock); list_add_tail(&sum->list, &entry->list); spin_unlock(&tree->lock); - return 0; } /* @@ -392,7 +391,7 @@ out: * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped */ -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { struct list_head *cur; struct btrfs_ordered_sum *sum; @@ -408,7 +407,6 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) } kfree(entry); } - return 0; } /* @@ -416,8 +414,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * and you must wake_up entry->wait. You must hold the tree lock * while you call this function. */ -static int __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +static void __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -444,35 +442,30 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } /* * remove an ordered extent from the tree. No references are dropped * but any waiters are woken. */ -int btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; - int ret; tree = &BTRFS_I(inode)->ordered_tree; spin_lock(&tree->lock); - ret = __btrfs_remove_ordered_extent(inode, entry); + __btrfs_remove_ordered_extent(inode, entry); spin_unlock(&tree->lock); wake_up(&entry->wait); - - return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -520,7 +513,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); - return 0; } /* @@ -533,7 +525,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; @@ -581,8 +573,6 @@ again: spin_unlock(&root->fs_info->ordered_extent_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); - - return 0; } /* @@ -617,7 +607,7 @@ void btrfs_start_ordered_extent(struct inode *inode, /* * Used to wait on ordered extents across a large range of bytes. */ -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; u64 orig_end; @@ -672,7 +662,6 @@ again: schedule_timeout(1); goto again; } - return 0; } /* @@ -956,9 +945,8 @@ out: * If trans is not null, we'll do a friendly check for a transaction that * is already flushing things and force the IO down ourselves. */ -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) { u64 last_mod; @@ -969,7 +957,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * commit, we can safely return without doing anything */ if (last_mod < root->fs_info->last_trans_committed) - return 0; + return; /* * the transaction is already committing. Just start the IO and @@ -977,7 +965,7 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, */ if (trans && root->fs_info->running_transaction->blocked) { btrfs_wait_ordered_range(inode, 0, (u64)-1); - return 0; + return; } spin_lock(&root->fs_info->ordered_extent_lock); @@ -986,6 +974,4 @@ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, &root->fs_info->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); - - return 0; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ff1f69aa188..c355ad4dc1a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -138,8 +138,8 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } -int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -int btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, @@ -154,14 +154,14 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int compress_type); -int btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); +void btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); -int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, @@ -170,10 +170,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); -int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -int btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +void btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index abc0fbffa51..a2e8aa40f3f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -948,12 +948,12 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } -static int scrub_submit(struct scrub_dev *sdev) +static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; if (sdev->curr == -1) - return 0; + return; sbio = sdev->bios[sdev->curr]; sbio->err = 0; @@ -961,8 +961,6 @@ static int scrub_submit(struct scrub_dev *sdev) atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); - - return 0; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -1008,9 +1006,7 @@ again: sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - ret = scrub_submit(sdev); - if (ret) - return ret; + scrub_submit(sdev); goto again; } sbio->spag[sbio->count].flags = flags; @@ -1025,9 +1021,7 @@ again: ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); if (!ret) { __free_page(page); - ret = scrub_submit(sdev); - if (ret) - return ret; + scrub_submit(sdev); goto again; } @@ -1036,13 +1030,8 @@ again: memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); } ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) { - int ret; - - ret = scrub_submit(sdev); - if (ret) - return ret; - } + if (sbio->count == SCRUB_PAGES_PER_BIO || force) + scrub_submit(sdev); return 0; } @@ -1656,7 +1645,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, return ret; } -int btrfs_scrub_pause(struct btrfs_root *root) +void btrfs_scrub_pause(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1671,29 +1660,24 @@ int btrfs_scrub_pause(struct btrfs_root *root) mutex_lock(&fs_info->scrub_lock); } mutex_unlock(&fs_info->scrub_lock); - - return 0; } -int btrfs_scrub_continue(struct btrfs_root *root) +void btrfs_scrub_continue(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; atomic_dec(&fs_info->scrub_pause_req); wake_up(&fs_info->scrub_pause_wait); - return 0; } -int btrfs_scrub_pause_super(struct btrfs_root *root) +void btrfs_scrub_pause_super(struct btrfs_root *root) { down_write(&root->fs_info->scrub_super_lock); - return 0; } -int btrfs_scrub_continue_super(struct btrfs_root *root) +void btrfs_scrub_continue_super(struct btrfs_root *root) { up_write(&root->fs_info->scrub_super_lock); - return 0; } int btrfs_scrub_cancel(struct btrfs_root *root) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9774e38a053..ae7963b2d52 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1408,9 +1408,7 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_compress(); - if (err) - goto free_sysfs; + btrfs_init_compress(); err = btrfs_init_cachep(); if (err) @@ -1451,7 +1449,6 @@ free_cachep: btrfs_destroy_cachep(); free_compress: btrfs_exit_compress(); -free_sysfs: btrfs_exit_sysfs(); return err; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04b77e3ceb7..6e256d90fd2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1214,8 +1214,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); - ret = btrfs_wait_ordered_extents(root, 0, 1); - BUG_ON(ret); + btrfs_wait_ordered_extents(root, 0, 1); } ret = btrfs_run_delayed_items(trans, root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 966cc74f5d6..37b52b8c672 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -212,14 +212,13 @@ int btrfs_pin_log_trans(struct btrfs_root *root) * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -int btrfs_end_log_trans(struct btrfs_root *root) +void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { smp_mb(); if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } - return 0; } @@ -378,12 +377,11 @@ insert: u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (found_size > item_size) { + if (found_size > item_size) btrfs_truncate_item(trans, root, path, item_size, 1); - } else if (found_size < item_size) { - ret = btrfs_extend_item(trans, root, path, - item_size - found_size); - } + else if (found_size < item_size) + btrfs_extend_item(trans, root, path, + item_size - found_size); } else if (ret) { return ret; } @@ -1963,8 +1961,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, return 0; } -static int wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { DEFINE_WAIT(wait); while (root->fs_info->last_trans_log_full_commit != @@ -1978,7 +1976,6 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } - return 0; } /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 2270ac58d74..862ac813f6b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -38,7 +38,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *inode, u64 dirid); -int btrfs_end_log_trans(struct btrfs_root *root); +void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ef41f285a47..1c5f8a4a2bb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -67,7 +67,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } -int btrfs_cleanup_fs_uuids(void) +void btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -77,7 +77,6 @@ int btrfs_cleanup_fs_uuids(void) list_del(&fs_devices->list); free_fs_devices(fs_devices); } - return 0; } static noinline struct btrfs_device *__find_device(struct list_head *head, @@ -130,7 +129,7 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, * the list if the block device is congested. This way, multiple devices * can make progress from a single worker thread. */ -static noinline int run_scheduled_bios(struct btrfs_device *device) +static noinline void run_scheduled_bios(struct btrfs_device *device) { struct bio *pending; struct backing_dev_info *bdi; @@ -316,7 +315,6 @@ loop_lock: done: blk_finish_plug(&plug); - return 0; } static void pending_bios_fn(struct btrfs_work *work) @@ -455,7 +453,7 @@ error: return ERR_PTR(-ENOMEM); } -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; @@ -503,7 +501,6 @@ again: fs_devices->latest_trans = latest_transid; mutex_unlock(&uuid_mutex); - return 0; } static void __free_device(struct work_struct *work) @@ -3967,7 +3964,7 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -static noinline int schedule_bio(struct btrfs_root *root, +static noinline void schedule_bio(struct btrfs_root *root, struct btrfs_device *device, int rw, struct bio *bio) { @@ -3979,7 +3976,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio_get(bio); btrfsic_submit_bio(rw, bio); bio_put(bio); - return 0; + return; } /* @@ -4013,7 +4010,6 @@ static noinline int schedule_bio(struct btrfs_root *root, if (should_queue) btrfs_queue_worker(&root->fs_info->submit_workers, &device->work); - return 0; } int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, @@ -4215,7 +4211,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return 0; } -static int fill_device_from_item(struct extent_buffer *leaf, +static void fill_device_from_item(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item, struct btrfs_device *device) { @@ -4232,8 +4228,6 @@ static int fill_device_from_item(struct extent_buffer *leaf, ptr = (unsigned long)btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - - return 0; } static int open_seed_devices(struct btrfs_root *root, u8 *fsid) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 19ac95048b8..bb6b03f97aa 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -260,12 +260,12 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); -int btrfs_cleanup_fs_uuids(void); +void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -- cgit v1.2.3 From d0082371cf086e0ba2bbd0367b2c9920532df24f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:57:19 +0100 Subject: btrfs: drop gfp_t from lock_extent lock_extent and unlock_extent are always called with GFP_NOFS, drop the argument and use GFP_NOFS consistently. Signed-off-by: Jeff Mahoney --- fs/btrfs/compression.c | 6 +++--- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 41 +++++++++++++++++++---------------------- fs/btrfs/extent_io.h | 9 ++++----- fs/btrfs/file.c | 7 +++---- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 33 ++++++++++++++------------------- fs/btrfs/ioctl.c | 17 ++++++++--------- fs/btrfs/relocation.c | 22 ++++++++++------------ 9 files changed, 63 insertions(+), 76 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0ed1ed22775..cfd158dccde 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -496,7 +496,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * sure they map to this compressed extent on disk. */ set_page_extent_mapped(page); - lock_extent(tree, last_offset, end, GFP_NOFS); + lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); @@ -506,7 +506,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -534,7 +534,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, nr_pages++; page_cache_release(page); } else { - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 73ccadce90b..69ef456b32f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -332,7 +332,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, return 0; lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); if (extent_buffer_uptodate(io_tree, eb, cached_state) && btrfs_header_generation(eb) == parent_transid) { ret = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65216fabacb..0112c02742f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1171,42 +1171,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state, gfp_t mask) + int bits, struct extent_state **cached_state) { int err; u64 failed_start; while (1) { err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, - cached_state, mask); - if (err == -EEXIST && (mask & __GFP_WAIT)) { + cached_state, GFP_NOFS); + if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; - } else { + } else break; - } WARN_ON(start > end); } return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return lock_extent_bits(tree, start, end, 0, NULL, mask); + return lock_extent_bits(tree, start, end, 0, NULL); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, mask); + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, mask); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); return 0; } return 1; @@ -1219,10 +1217,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - mask); + GFP_NOFS); } /* @@ -1518,8 +1516,7 @@ again: BUG_ON(ret); /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, - 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -2557,11 +2554,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end = page_end; while (1) { - lock_extent(tree, start, end, GFP_NOFS); + lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_extent(inode, start); if (!ordered) break; - unlock_extent(tree, start, end, GFP_NOFS); + unlock_extent(tree, start, end); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } @@ -2598,7 +2595,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end - cur + 1, 0); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end, GFP_NOFS); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2650,7 +2647,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2660,7 +2657,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3274,7 +3271,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -3488,7 +3485,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state, GFP_NOFS); + &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f7f321ee4ba..439e183d45b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -182,14 +182,13 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached, gfp_t mask); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); + int bits, struct extent_state **cached); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e8d06b6b919..0eb80cc4ec8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1105,8 +1105,7 @@ again: if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state, - GFP_NOFS); + start_pos, last_pos - 1, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1); if (ordered && @@ -1638,7 +1637,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); + locked_end, 0, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -1737,7 +1736,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) return -ENXIO; lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, - &cached_state, GFP_NOFS); + &cached_state); /* * Delalloc is such a pain. If we have a hole and we have pending diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 710ea380c7e..ecbcc8711a0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -869,7 +869,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); node = rb_first(&ctl->free_space_offset); if (!node && cluster) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc800eefdbc..d16bf3f5da0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -597,7 +597,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->ram_size - 1); /* allocate blocks */ ret = cow_file_range(inode, async_cow->locked_page, @@ -625,8 +625,7 @@ retry: } lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1, - GFP_NOFS); + async_extent->start + async_extent->ram_size - 1); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -649,7 +648,7 @@ retry: async_extent->pages = NULL; unlock_extent(io_tree, async_extent->start, async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->ram_size - 1); goto retry; } @@ -1574,7 +1573,7 @@ again: page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, - &cached_state, GFP_NOFS); + &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) @@ -1765,7 +1764,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -3285,8 +3284,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -3362,7 +3360,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) btrfs_wait_ordered_range(inode, hole_start, block_end - hole_start); lock_extent_bits(io_tree, hole_start, block_end - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, hole_start); if (!ordered) break; @@ -5604,7 +5602,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, free_extent_map(em); /* DIO will do one hole at a time, so just unlock a sector */ unlock_extent(&BTRFS_I(inode)->io_tree, start, - start + root->sectorsize - 1, GFP_NOFS); + start + root->sectorsize - 1); return 0; } @@ -5745,7 +5743,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) } while (bvec <= bvec_end); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1, GFP_NOFS); + dip->logical_offset + dip->bytes - 1); bio->bi_private = dip->private; kfree(dip->csums); @@ -5796,7 +5794,7 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, ordered->file_offset + ordered->len - 1, 0, - &cached_state, GFP_NOFS); + &cached_state); if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { ret = btrfs_mark_extent_written(trans, inode, @@ -6211,7 +6209,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -6365,8 +6363,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); if (ordered) { @@ -6388,8 +6385,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) } btrfs_put_ordered_extent(ordered); cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -6464,8 +6460,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, - GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); set_page_extent_mapped(page); /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d8b54715c2d..4e514946143 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -797,9 +797,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if (!em) { /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + lock_extent(io_tree, start, start + len - 1); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + unlock_extent(io_tree, start, start + len - 1); if (IS_ERR(em)) return 0; @@ -887,10 +887,10 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end, GFP_NOFS); + lock_extent(tree, page_start, page_end); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end, GFP_NOFS); + unlock_extent(tree, page_start, page_end); if (!ordered) break; @@ -946,8 +946,7 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state, - GFP_NOFS); + page_start, page_end - 1, 0, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, @@ -2326,13 +2325,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, another, and lock file content */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + lock_extent(&BTRFS_I(src)->io_tree, off, off+len); ordered = btrfs_lookup_first_ordered_extent(src, off+len); if (!ordered && !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, EXTENT_DELALLOC, 0, NULL)) break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); if (ordered) btrfs_put_ordered_extent(ordered); btrfs_wait_ordered_range(src, off, len); @@ -2551,7 +2550,7 @@ next: ret = 0; out: btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 974b0dfeffa..5e89a5ea692 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1604,15 +1604,14 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, root->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - GFP_NOFS); + key.offset, end); if (!ret) continue; btrfs_drop_extent_cache(inode, key.offset, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, GFP_NOFS); + key.offset, end); } } @@ -1983,9 +1982,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); } return 0; } @@ -2889,12 +2888,12 @@ int prealloc_file_extent_cluster(struct inode *inode, else end = cluster->end - offset; - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; @@ -2926,7 +2925,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -2937,7 +2936,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } btrfs_drop_extent_cache(inode, start, end, 0); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); return ret; } @@ -3017,8 +3016,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); set_page_extent_mapped(page); @@ -3034,7 +3032,7 @@ static int relocate_file_extent_cluster(struct inode *inode, set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end, GFP_NOFS); + page_start, page_end); unlock_page(page); page_cache_release(page); -- cgit v1.2.3 From 3fbe5c02ae5a59053d779392b9a12aa8f6d6198e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:57:19 +0100 Subject: btrfs: split extent_state ops set_extent_bit can do exclusive locking but only when called by lock_extent*, Drop the exclusive bits argument except when called by lock_extent. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++++------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0112c02742f..ffa7cc3370c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -721,9 +721,10 @@ static void uncache_state(struct extent_state **cached_ptr) * [start, end] is inclusive This takes the tree lock. */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) +static int __must_check +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -917,6 +918,15 @@ search_again: goto again; } +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, 0, failed_start, + cached_state, mask); +} + + /** * convert_extent - convert all bits in a given range from one bit to another * @tree: the io tree to search @@ -1111,14 +1121,14 @@ search_again: int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return set_extent_bit(tree, start, end, bits, 0, NULL, + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } @@ -1133,7 +1143,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - 0, NULL, cached_state, mask); + NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -1147,7 +1157,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); } @@ -1155,7 +1165,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - NULL, cached_state, mask); + cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1176,9 +1186,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1199,8 +1209,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 439e183d45b..3a171c25927 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -209,7 +209,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, + int bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d16bf3f5da0..593a2c3a27a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6233,7 +6233,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (writing) { write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DELALLOC, 0, NULL, &cached_state, + EXTENT_DELALLOC, NULL, &cached_state, GFP_NOFS); if (ret) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, -- cgit v1.2.3 From 2c536799f1bde905bbacf7af3aa6be3f4de66005 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 3 Oct 2011 23:22:41 -0400 Subject: btrfs: btrfs_drop_snapshot should return int Commit cb1b69f4 (Btrfs: forced readonly when btrfs_drop_snapshot() fails) made btrfs_drop_snapshot return void because there were no callers checking the return value. That is the wrong order to handle error propogation since the caller will have no idea that an error has occured and continue on as if nothing went wrong. Signed-off-by: Jeff Mahoney --- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/relocation.c | 3 ++- fs/btrfs/transaction.c | 7 +++++-- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 30c5a247ab2..f6bca05a4b4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2661,9 +2661,9 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0daa1df1643..cd6f8ae0a78 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6734,7 +6734,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -void btrfs_drop_snapshot(struct btrfs_root *root, +int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc) { @@ -6902,7 +6902,7 @@ out_free: out: if (err) btrfs_std_error(root->fs_info, err); - return; + return err; } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5e89a5ea692..a87678ead61 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2272,7 +2272,8 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); + BUG_ON(ret < 0); } if (found) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6e256d90fd2..fb5cd5a4adb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1387,6 +1387,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) spin_unlock(&fs_info->trans_lock); while (!list_empty(&list)) { + int ret; + root = list_entry(list.next, struct btrfs_root, root_list); list_del(&root->root_list); @@ -1394,9 +1396,10 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0, 0); + ret = btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1, 0); + ret =btrfs_drop_snapshot(root, NULL, 1, 0); + BUG_ON(ret < 0); } return 0; } -- cgit v1.2.3 From ce598979be6f83549c90f42ba522a19a33727611 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 26 Jul 2011 11:32:23 -0700 Subject: btrfs: Don't BUG_ON errors from btrfs_create_subvol_root() This is called from only one place - create_subvol() which passes errors safely back out to it's caller, btrfs_mksubvol where they are handled. Additionally, btrfs_create_subvol_root() itself bug's needlessly from error return of btrfs_update_inode(). Since create_subvol() was fixed to catch errors we can bubble this one up too. Signed-off-by: Mark Fasheh --- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 593a2c3a27a..41132339e2e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6734,10 +6734,9 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, new_root, inode); - BUG_ON(err); iput(inode); - return 0; + return err; } struct inode *btrfs_alloc_inode(struct super_block *sb) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e514946143..de25e4255ae 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -430,6 +430,11 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, new_dirid); + if (ret) { + /* We potentially lose an unused inode item here */ + goto fail; + } + /* * insert the directory item */ -- cgit v1.2.3 From be1a5564fd39fa2ca6adbb41c75fb08f96a1ffcb Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 8 Aug 2011 13:20:18 -0700 Subject: btrfs: Don't BUG_ON() errors in update_ref_for_cow() The only caller of update_ref_for_cow() is __btrfs_cow_block() which was originally ignoring any return values. update_ref_for_cow() however doesn't look like a candidate to become a void function - there are a few places where errors can occur. So instead I changed update_ref_for_cow() to bubble all errors up (instead of BUG_ON). __btrfs_cow_block() was then updated to catch and BUG_ON() any errors from update_ref_for_cow(). The end effect is that we have no change in behavior, but about 8 different places where a BUG_ON(ret) was removed. Obviously a future patch will have to address the BUG_ON() in __btrfs_cow_block(). Signed-off-by: Mark Fasheh --- fs/btrfs/ctree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9d472c2a8de..e2e43c07f6b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -331,7 +331,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, buf->len, &refs, &flags); - BUG_ON(ret); + if (ret) + return ret; BUG_ON(refs == 0); } else { refs = 1; @@ -375,7 +376,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, buf->start, buf->len, new_flags, 0); - BUG_ON(ret); + if (ret) + return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { @@ -415,7 +417,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *cow; - int level; + int level, ret; int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -467,7 +469,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow, &last_ref); + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + BUG_ON(ret); if (root->ref_cows) btrfs_reloc_cow_block(trans, root, buf, cow); -- cgit v1.2.3 From 0678b61851b510ba68341dff59cd9b47e1712e91 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 5 Aug 2011 15:46:16 -0700 Subject: btrfs: Don't BUG_ON kzalloc error in btrfs_lookup_csums_range() Unfortunately it isn't enough to just exit here - the kzalloc() happens in a loop and the allocated items are added to a linked list whose head is passed in from the caller. To fix the BUG_ON() and also provide the semantic that the list passed in is only modified on success, I create function-local temporary list that we add items too. If no error is met, that list is spliced to the callers at the end of the function. Otherwise the list will be walked and all items freed before the error value is returned. I did a simple test on this patch by forcing an error at the kzalloc() point and verifying that when this hits (git clone seemed to exercise this), the function throws the proper error. Unfortunately but predictably, we later hit a BUG_ON(ret) type line that still hasn't been fixed up ;) Signed-off-by: Mark Fasheh --- fs/btrfs/file-item.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index edb69b4d533..89af104c756 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -284,6 +284,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; + LIST_HEAD(tmplist); unsigned long offset; int ret; size_t size; @@ -358,7 +359,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS); - BUG_ON(!sums); + if (!sums) { + ret = -ENOMEM; + goto fail; + } sector_sum = sums->sums; sums->bytenr = start; @@ -380,12 +384,19 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, offset += csum_size; sector_sum++; } - list_add_tail(&sums->list, list); + list_add_tail(&sums->list, &tmplist); } path->slots[0]++; } ret = 0; fail: + while (ret < 0 && !list_empty(&tmplist)) { + sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + list_splice_tail(&tmplist, list); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 4ed1d16e944c61cfb8a78159548672e7df168d97 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 10 Aug 2011 12:32:10 -0700 Subject: btrfs: Don't BUG_ON errors in __finish_chunk_alloc() All callers of __finish_chunk_alloc() BUG_ON() return value, so it's trivial for us to always bubble up any errors caught in __finish_chunk_alloc() to be caught there. Signed-off-by: Mark Fasheh --- fs/btrfs/volumes.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c5f8a4a2bb..c4ea7d8bea0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3417,16 +3417,18 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - BUG_ON(ret); - if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + /* + * TODO: Cleanup of inserted chunk root in case of + * failure. + */ ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); - BUG_ON(ret); } kfree(chunk); - return 0; + return ret; } /* -- cgit v1.2.3 From e5df957328b18baa731307c66cfe8e7a4981df65 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 29 Aug 2011 14:17:04 -0700 Subject: btrfs: Go readonly on bad extent refs in update_ref_for_cow() update_ref_for_cow() will BUG_ON() after it's call to btrfs_lookup_extent_info() if no existing references are found. Since refs are computed directly from disk, this should be treated as a corruption instead of a logic error. Signed-off-by: Mark Fasheh --- fs/btrfs/ctree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e2e43c07f6b..3b767d2b68e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -333,7 +333,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, buf->len, &refs, &flags); if (ret) return ret; - BUG_ON(refs == 0); + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || -- cgit v1.2.3 From b68dc2a93e794c8507338c91577a277efa4555d5 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 29 Aug 2011 14:30:39 -0700 Subject: btrfs: Don't BUG_ON errors from update_ref_for_cow() __btrfs_cow_block(), the only caller of update_ref_for_cow() will BUG_ON() any error return. Instead, we can go read-only fs as update_ref_for_cow() manipulates disk data in a way which doesn't look like it's easily rolled back. Signed-off-by: Mark Fasheh --- fs/btrfs/ctree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3b767d2b68e..36e16bd5079 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -474,7 +474,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - BUG_ON(ret); + if (ret) { + btrfs_std_error(root->fs_info, ret); + return ret; + } if (root->ref_cows) btrfs_reloc_cow_block(trans, root, buf, cow); -- cgit v1.2.3 From 305a26af5b2561a66859ef05ed7eb73d3c9f0913 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 1 Sep 2011 11:27:57 -0700 Subject: btrfs: Go readonly on tree errors in balance_level balace_level() seems to deal with missing tree nodes by BUG_ON(). Instead, we can easily just set the file system readonly and bubble -EROFS back up the stack. Signed-off-by: Mark Fasheh --- fs/btrfs/ctree.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 36e16bd5079..651a26a6c65 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -944,7 +944,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - BUG_ON(!child); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); @@ -1042,7 +1047,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left); + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; -- cgit v1.2.3 From 2cdcecbc153c222fae1be6f8ddb320b29e3a5200 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 8 Sep 2011 17:14:32 -0700 Subject: btrfs: Don't BUG_ON insert errors in btrfs_alloc_dev_extent() The only caller of btrfs_alloc_dev_extent() is __btrfs_alloc_chunk() which already bugs on any error returned. We can remove the BUG_ON's in btrfs_alloc_dev_extent() then since __btrfs_alloc_chunk() will "catch" them anyway. Signed-off-by: Mark Fasheh --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c4ea7d8bea0..281ff354f64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1075,7 +1075,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_EXTENT_KEY; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1090,6 +1091,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); +out: btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 1dd4602fa74273c28b3577a58aa389f330e9a0dc Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 8 Sep 2011 17:29:00 -0700 Subject: btrfs: Remove BUG_ON from __btrfs_alloc_chunk() We BUG_ON() error from add_extent_mapping(), but that error looks pretty easy to bubble back up - as far as I can tell there have not been any permanent modifications to fs state at that point. Signed-off-by: Mark Fasheh --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 281ff354f64..f002973959d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3327,8 +3327,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - BUG_ON(ret); free_extent_map(em); + if (ret) + goto error; ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, -- cgit v1.2.3 From 3acd395317f22b4346a139571cd4723408c5d4af Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 8 Sep 2011 17:40:01 -0700 Subject: btrfs: Remove BUG_ON from __finish_chunk_alloc() btrfs_alloc_chunk() unconditionally BUGs on any error returned from __finish_chunk_alloc() so there's no need for two BUG_ON lines. Remove the one from __finish_chunk_alloc(). Signed-off-by: Mark Fasheh --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f002973959d..394bf15ea18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3383,7 +3383,8 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); - BUG_ON(ret); + if (ret) + goto out_free; index++; } @@ -3430,6 +3431,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); } +out_free: kfree(chunk); return ret; } -- cgit v1.2.3 From 4da35113426d16673aa1fb0613c14ca2e419e7fd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 14:57:30 +0100 Subject: btrfs: add varargs to btrfs_error btrfs currently handles most errors with BUG_ON. This patch is a work-in- progress but aims to handle most errors other than internal logic errors and ENOMEM more gracefully. This iteration prevents most crashes but can run into lockups with the page lock on occasion when the timing "works out." Signed-off-by: Jeff Mahoney --- fs/btrfs/ctree.h | 12 +++++++++-- fs/btrfs/super.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f6bca05a4b4..8829f809985 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2964,13 +2964,21 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno); + unsigned int line, int errno, const char *fmt, ...); #define btrfs_std_error(fs_info, errno) \ do { \ if ((errno)) \ - __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ + __btrfs_std_error((fs_info), __func__, \ + __LINE__, (errno), NULL); \ +} while (0) + +#define btrfs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_std_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae7963b2d52..7fe69eef760 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -127,25 +127,74 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * invokes the approciate error response. */ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno) + unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; char nbuf[16]; const char *errstr; + va_list args; + va_start(args, fmt); /* * Special case: if the error is EROFS, and we're already * under MS_RDONLY, then it is safe here. */ if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; + return; - errstr = btrfs_decode_error(fs_info, errno, nbuf); - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - save_error_info(fs_info); + errstr = btrfs_decode_error(fs_info, errno, nbuf); + if (fmt) { + struct va_format vaf = { + .fmt = fmt, + .va = &args, + }; + + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", + sb->s_id, function, line, errstr, &vaf); + } else { + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + + /* Don't go through full error handling during mount */ + if (sb->s_flags & MS_BORN) { + save_error_info(fs_info); + btrfs_handle_error(fs_info); + } + va_end(args); +} - btrfs_handle_error(fs_info); +const char *logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; + char lvl[4]; + struct va_format vaf; + va_list args; + const char *type = logtypes[4]; + + va_start(args, fmt); + + if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { + strncpy(lvl, fmt, 3); + fmt += 3; + type = logtypes[fmt[1] - '0']; + } else + *lvl = '\0'; + + vaf.fmt = fmt; + vaf.va = &args; + printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); } /* -- cgit v1.2.3 From 49b25e0540904be0bf558b84475c69d72e4de66e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 1 Mar 2012 17:24:58 +0100 Subject: btrfs: enhance transaction abort infrastructure Signed-off-by: Jeff Mahoney --- fs/btrfs/ctree.h | 13 +++- fs/btrfs/disk-io.c | 50 +++++++++++-- fs/btrfs/disk-io.h | 4 ++ fs/btrfs/relocation.c | 14 ++-- fs/btrfs/scrub.c | 8 ++- fs/btrfs/super.c | 74 ++++++++++++++++--- fs/btrfs/transaction.c | 190 ++++++++++++++++++++++++++++++++++++++++--------- fs/btrfs/transaction.h | 3 + 8 files changed, 300 insertions(+), 56 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8829f809985..b6ebea5582c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2968,6 +2968,16 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno); + +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + __btrfs_abort_transaction(trans, root, __func__, \ + __LINE__, errno); \ +} while (0) + #define btrfs_std_error(fs_info, errno) \ do { \ if ((errno)) \ @@ -3024,7 +3034,7 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ @@ -3034,6 +3044,7 @@ void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); void btrfs_scrub_continue_super(struct btrfs_root *root); +int __btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel(struct btrfs_root *root); int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 69ef456b32f..6297a030ac5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -61,7 +61,6 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents); -static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -2896,6 +2895,19 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } +/* Kill all outstanding I/O */ +void btrfs_abort_devices(struct btrfs_root *root) +{ + struct list_head *head; + struct btrfs_device *dev; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + blk_abort_queue(dev->bdev->bd_disk->queue); + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); +} + void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); @@ -3536,13 +3548,43 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, return 0; } -static int btrfs_cleanup_transaction(struct btrfs_root *root) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + btrfs_destroy_delayed_refs(cur_trans, root); + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + cur_trans->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + cur_trans->in_commit = 1; + cur_trans->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + cur_trans->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + + cur_trans->commit_done = 1; + if (waitqueue_active(&cur_trans->commit_wait)) + wake_up(&cur_trans->commit_wait); + + btrfs_destroy_pending_snapshots(cur_trans); + + btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, + EXTENT_DIRTY); + + /* + memset(cur_trans, 0, sizeof(*cur_trans)); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + */ +} + +int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; LIST_HEAD(list); - WARN_ON(1); - mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 1608492d485..a7ace1a2dd1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -85,6 +85,10 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_cleanup_transaction(struct btrfs_root *root); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, + struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a87678ead61..cba7a0bf366 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4410,7 +4410,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, * called after snapshot is created. migrate block reservation * and create reloc root for the newly created snapshot */ -void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_root *root = pending->root; @@ -4420,7 +4420,7 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return; + return 0; rc = root->fs_info->reloc_ctl; rc->merging_rsv_size += rc->nodes_relocated; @@ -4429,19 +4429,21 @@ void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, rc->nodes_relocated); - BUG_ON(ret); + if (ret) + return ret; } new_root = pending->snap; reloc_root = create_reloc_root(trans, root->reloc_root, new_root->root_key.objectid); + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); BUG_ON(ret < 0); new_root->reloc_root = reloc_root; - if (rc->create_reloc_tree) { + if (rc->create_reloc_tree) ret = clone_backref_node(trans, rc, root, reloc_root); - BUG_ON(ret); - } + return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a2e8aa40f3f..794cbb52f30 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1680,9 +1680,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root) up_write(&root->fs_info->scrub_super_lock); } -int btrfs_scrub_cancel(struct btrfs_root *root) +int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); if (!atomic_read(&fs_info->scrubs_running)) { @@ -1703,6 +1702,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root) return 0; } +int btrfs_scrub_cancel(struct btrfs_root *root) +{ + return __btrfs_scrub_cancel(root->fs_info); +} + int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7fe69eef760..0517bd70b04 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -119,6 +119,8 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); + __btrfs_scrub_cancel(fs_info); +// WARN_ON(1); } } @@ -197,6 +199,34 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); } +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. + * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *function, + unsigned int line, int errno) +{ + WARN_ON_ONCE(1); + trans->aborted = errno; + /* Nothing used. The other threads that have joined this + * transaction may be able to continue. */ + if (!trans->blocks_used) { + btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); + return; + } + trans->transaction->aborted = errno; + __btrfs_std_error(root->fs_info, function, line, errno, NULL); +} /* * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. @@ -295,6 +325,7 @@ static match_table_t tokens = { /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. + * XXX JDM: This needs to be cleaned up for remount. */ int btrfs_parse_options(struct btrfs_root *root, char *options) { @@ -1096,11 +1127,20 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; + unsigned old_flags = sb->s_flags; + unsigned long old_opts = fs_info->mount_opt; + unsigned long old_compress_type = fs_info->compress_type; + u64 old_max_inline = fs_info->max_inline; + u64 old_alloc_start = fs_info->alloc_start; + int old_thread_pool_size = fs_info->thread_pool_size; + unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; ret = btrfs_parse_options(root, data); - if (ret) - return -EINVAL; + if (ret) { + ret = -EINVAL; + goto restore; + } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -1108,26 +1148,44 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (*flags & MS_RDONLY) { sb->s_flags |= MS_RDONLY; - ret = btrfs_commit_super(root); - WARN_ON(ret); + ret = btrfs_commit_super(root); + if (ret) + goto restore; } else { if (fs_info->fs_devices->rw_devices == 0) - return -EACCES; + ret = -EACCES; + goto restore; if (btrfs_super_log_root(fs_info->super_copy) != 0) - return -EINVAL; + ret = -EINVAL; + goto restore; ret = btrfs_cleanup_fs_roots(fs_info); - WARN_ON(ret); + if (ret) + goto restore; /* recover relocation */ ret = btrfs_recover_relocation(root); - WARN_ON(ret); + if (ret) + goto restore; sb->s_flags &= ~MS_RDONLY; } return 0; + +restore: + /* We've hit an error - don't reset MS_RDONLY */ + if (sb->s_flags & MS_RDONLY) + old_flags |= MS_RDONLY; + sb->s_flags = old_flags; + fs_info->mount_opt = old_opts; + fs_info->compress_type = old_compress_type; + fs_info->max_inline = old_max_inline; + fs_info->alloc_start = old_alloc_start; + fs_info->thread_pool_size = old_thread_pool_size; + fs_info->metadata_ratio = old_metadata_ratio; + return ret; } /* Used to sort the devices by max_avail(descending sort) */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fb5cd5a4adb..5a4999aa8fe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,7 +31,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static noinline void put_transaction(struct btrfs_transaction *transaction) +void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { @@ -58,6 +58,12 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) spin_lock(&root->fs_info->trans_lock); loop: + /* The file system has been taken offline. No new transactions. */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&root->fs_info->trans_lock); + return -EROFS; + } + if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -67,6 +73,8 @@ loop: cur_trans = root->fs_info->running_transaction; if (cur_trans) { + if (cur_trans->aborted) + return cur_trans->aborted; atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -123,6 +131,7 @@ loop: root->fs_info->generation++; cur_trans->transid = root->fs_info->generation; root->fs_info->running_transaction = cur_trans; + cur_trans->aborted = 0; spin_unlock(&root->fs_info->trans_lock); return 0; @@ -318,6 +327,7 @@ again: h->use_count = 1; h->block_rsv = NULL; h->orig_rsv = NULL; + h->aborted = 0; smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -440,6 +450,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; + int err; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) @@ -453,8 +464,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; - if (updates) - btrfs_run_delayed_refs(trans, root, updates); + if (updates) { + err = btrfs_run_delayed_refs(trans, root, updates); + if (err) /* Error code will also eval true */ + return err; + } trans->block_rsv = rsv; @@ -525,6 +539,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(root); + if (trans->aborted || + root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + return -EIO; + } + return 0; } @@ -690,11 +709,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); + if (ret) + return ret; old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); - BUG_ON(ret); + if (ret) + return ret; } if (root != root->fs_info->extent_root) @@ -705,6 +726,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, /* * update all the cowonly tree roots on disk + * + * The error handling in this function may not be obvious. Any of the + * failures will cause the file system to go offline. We still need + * to clean up the delayed refs. */ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -715,22 +740,30 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, + 0, &eb); btrfs_tree_unlock(eb); free_extent_buffer(eb); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) + return ret; while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); - update_cowonly_root(trans, root); + ret = update_cowonly_root(trans, root); + if (ret) + return ret; } down_write(&fs_info->extent_commit_sem); @@ -874,7 +907,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - pending->error = -ENOMEM; + ret = pending->error = -ENOMEM; goto fail; } @@ -911,7 +944,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -920,12 +953,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->error = -EEXIST; dput(parent); goto fail; - } + } else if (ret) + goto abort_trans; btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); - BUG_ON(ret); + if (ret) + goto abort_trans; /* * pull in the delayed directory update @@ -934,7 +969,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) /* Transaction aborted */ + goto fail; record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); @@ -949,10 +985,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_flags(new_root_item, root_flags); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + if (ret) + goto abort_trans; + btrfs_set_lock_blocking(old); - btrfs_copy_root(trans, root, old, &tmp, objectid); + ret = btrfs_copy_root(trans, root, old, &tmp, objectid); + if (ret) + goto abort_trans; + btrfs_tree_unlock(old); free_extent_buffer(old); @@ -966,7 +1008,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); - BUG_ON(ret); + if (ret) + goto abort_trans; /* * insert root back/forward references @@ -975,19 +1018,28 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) + goto fail; dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(pending->snap)); + if (IS_ERR(pending->snap)) + goto abort_trans; - btrfs_reloc_post_snapshot(trans, pending); + ret = btrfs_reloc_post_snapshot(trans, pending); + if (ret) + goto abort_trans; + ret = 0; fail: kfree(new_root_item); trans->block_rsv = rsv; btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); - return 0; + return ret; + +abort_trans: + btrfs_abort_transaction(trans, root, ret); + goto fail; } /* @@ -1124,6 +1176,33 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, return 0; } + +static void cleanup_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + WARN_ON(trans->use_count > 1); + + spin_lock(&root->fs_info->trans_lock); + list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + + btrfs_cleanup_one_transaction(trans->transaction, root); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + trace_btrfs_transaction_commit(root); + + btrfs_scrub_continue(root); + + if (current->journal_info == trans) + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); +} + /* * btrfs_transaction state sequence: * in_commit = 0, blocked = 0 (initial) @@ -1135,10 +1214,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { unsigned long joined = 0; - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret; + int ret = -EIO; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); @@ -1148,13 +1227,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (cur_trans->aborted) + goto cleanup_transaction; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; cur_trans = trans->transaction; + /* * set the flushing flag so procs in this transaction have to * start sending their work down. @@ -1162,19 +1246,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->delayed_refs.flushing = 1; ret = btrfs_run_delayed_refs(trans, root, 0); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { spin_unlock(&cur_trans->commit_lock); atomic_inc(&cur_trans->use_count); - btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans, root); wait_for_commit(root, cur_trans); put_transaction(cur_trans); - return 0; + return ret; } trans->transaction->in_commit = 1; @@ -1218,7 +1303,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) + goto cleanup_transaction; /* * rename don't use btrfs_join_transaction, so, once we @@ -1260,13 +1346,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->reloc_mutex); ret = btrfs_run_delayed_items(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } ret = create_pending_snapshots(trans, root->fs_info); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } /* * make sure none of the code above managed to slip in a @@ -1293,7 +1388,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); ret = commit_fs_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -1301,7 +1399,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_free_log_root_tree(trans, root->fs_info); ret = commit_cowonly_roots(trans, root); - BUG_ON(ret); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } btrfs_prepare_extent_commit(trans, root); @@ -1335,8 +1436,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wake_up(&root->fs_info->transaction_wait); ret = btrfs_write_and_wait_transaction(trans, root); - BUG_ON(ret); - write_ctree_super(trans, root, 0); + if (ret) { + btrfs_error(root->fs_info, ret, + "Error while writing out transaction."); + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } + + ret = write_ctree_super(trans, root, 0); + if (ret) { + mutex_unlock(&root->fs_info->tree_log_mutex); + goto cleanup_transaction; + } /* * the super is written, we can safely allow the tree-loggers @@ -1372,6 +1483,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); return ret; + +cleanup_transaction: + btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); +// WARN_ON(1); + if (current->journal_info == trans) + current->journal_info = NULL; + cleanup_transaction(trans, root); + + return ret; } /* diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 02564e6230a..fe27379e368 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct btrfs_delayed_ref_root delayed_refs; + int aborted; }; struct btrfs_trans_handle { @@ -55,6 +56,7 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + int aborted; }; struct btrfs_pending_snapshot { @@ -114,4 +116,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); +void put_transaction(struct btrfs_transaction *transaction); #endif -- cgit v1.2.3 From 79787eaab46121d4713ed03c8fc63b9ec3eaec76 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 12 Mar 2012 16:03:00 +0100 Subject: btrfs: replace many BUG_ONs with proper error handling btrfs currently handles most errors with BUG_ON. This patch is a work-in- progress but aims to handle most errors other than internal logic errors and ENOMEM more gracefully. This iteration prevents most crashes but can run into lockups with the page lock on occasion when the timing "works out." Signed-off-by: Jeff Mahoney --- fs/btrfs/compression.c | 24 +-- fs/btrfs/ctree.c | 19 +-- fs/btrfs/delayed-inode.c | 27 +++- fs/btrfs/dir-item.c | 1 + fs/btrfs/disk-io.c | 116 ++++++++++---- fs/btrfs/export.c | 2 +- fs/btrfs/extent-tree.c | 316 +++++++++++++++++++++++++----------- fs/btrfs/extent_io.c | 40 ++--- fs/btrfs/file-item.c | 13 +- fs/btrfs/file.c | 45 ++++-- fs/btrfs/free-space-cache.c | 13 +- fs/btrfs/inode-item.c | 1 + fs/btrfs/inode-map.c | 19 ++- fs/btrfs/inode.c | 378 ++++++++++++++++++++++++++++++++------------ fs/btrfs/ioctl.c | 79 +++++++-- fs/btrfs/orphan.c | 2 +- fs/btrfs/relocation.c | 35 ++-- fs/btrfs/root-tree.c | 12 +- fs/btrfs/scrub.c | 3 + fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 36 +++-- fs/btrfs/tree-log.c | 79 +++++++-- fs/btrfs/volumes.c | 103 ++++++++---- 23 files changed, 980 insertions(+), 385 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cfd158dccde..d11afa67c7d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -391,16 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, */ atomic_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); @@ -420,15 +420,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_get(bio); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); return 0; @@ -661,7 +661,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * inc the count before we submit the bio so @@ -674,14 +674,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); @@ -697,15 +697,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 651a26a6c65..e697afd1815 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -356,14 +356,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_inc_ref(trans, root, cow, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -373,7 +373,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { ret = btrfs_set_disk_extent_flags(trans, root, @@ -390,9 +390,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1, 1); else ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, buf, 1, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); *last_ref = 1; @@ -475,7 +475,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_abort_transaction(trans, root, ret); return ret; } @@ -2713,7 +2713,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; + if (ret == -ENOSPC) + ret = 1; goto out; } @@ -4017,7 +4018,7 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); + BUG_ON(!cur); /* -ENOMEM */ btrfs_tree_read_lock(cur); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6829590d0fb..03e3748d84d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -115,6 +115,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) return NULL; } +/* Will return either the node or PTR_ERR(-ENOMEM) */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct inode *inode) { @@ -1106,16 +1107,25 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, return 0; } -/* Called when committing the transaction. */ +/* + * Called when committing the transaction. + * Returns 0 on success. + * Returns < 0 on error and returns with an aborted transaction with any + * outstanding delayed items cleaned up. + */ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; + if (trans->aborted) + return -EIO; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1128,17 +1138,18 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node) { - root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, root, + curr_root = curr_node->root; + ret = btrfs_insert_delayed_items(trans, path, curr_root, curr_node); if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - curr_node); + ret = btrfs_delete_delayed_items(trans, path, + curr_root, curr_node); if (!ret) - ret = btrfs_update_delayed_inode(trans, root, path, - curr_node); + ret = btrfs_update_delayed_inode(trans, curr_root, + path, curr_node); if (ret) { btrfs_release_delayed_node(curr_node); + btrfs_abort_transaction(trans, root, ret); break; } @@ -1149,6 +1160,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, btrfs_free_path(path); trans->block_rsv = block_rsv; + return ret; } @@ -1369,6 +1381,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) btrfs_wq_run_delayed_node(delayed_root, root, 0); } +/* Will return 0 or -ENOMEM */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 76743308bd9..c1a074d0696 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -115,6 +115,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). + * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6297a030ac5..16a0cada26c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -98,6 +98,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; + int error; }; /* @@ -405,7 +406,7 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) u64 found_start; unsigned long len; struct extent_buffer *eb; - int ret; + int ret = -EIO; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -423,13 +424,20 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) eb = alloc_extent_buffer(tree, start, len, page); if (eb == NULL) { WARN_ON(1); + ret = -ENOMEM; goto out; } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); - BUG_ON(ret); + if (ret) { + btrfs_printk(root->fs_info, KERN_WARNING + "Failed to checksum dirty buffer @ %llu[%lu]\n", + start, len); + goto err; + } WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + ret = -EIO; found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); @@ -444,10 +452,11 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) goto err; } csum_tree_block(root, eb, 0); + ret = 0; err: free_extent_buffer(eb); out: - return 0; + return ret; } static int check_tree_block_fsid(struct btrfs_root *root, @@ -718,11 +727,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; + int ret; async = container_of(work, struct async_submit_bio, work); - async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags, + async->bio_offset); + if (ret) + async->error = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -743,6 +755,12 @@ static void run_one_async_done(struct btrfs_work *work) waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); + /* If an error occured we just want to clean up the bio and move on */ + if (async->error) { + bio_endio(async->bio, async->error); + return; + } + async->submit_bio_done(async->inode, async->rw, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); @@ -784,6 +802,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; + async->error = 0; + atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) @@ -805,15 +825,18 @@ static int btree_csum_one_bio(struct bio *bio) struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; struct btrfs_root *root; + int ret = 0; WARN_ON(bio->bi_vcnt <= 0); while (bio_index < bio->bi_vcnt) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root, bvec->bv_page); + if (ret) + break; bio_index++; bvec++; } - return 0; + return ret; } static int __btree_submit_bio_start(struct inode *inode, int rw, @@ -825,8 +848,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - btree_csum_one_bio(bio); - return 0; + return btree_csum_one_bio(bio); } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, @@ -1381,7 +1403,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); + BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; @@ -1618,7 +1640,6 @@ static int transaction_kthread(void *arg) u64 transid; unsigned long now; unsigned long delay; - int ret; do { delay = HZ * 30; @@ -1642,11 +1663,12 @@ static int transaction_kthread(void *arg) transid = cur->transid; spin_unlock(&root->fs_info->trans_lock); + /* If the file system is aborted, this will always fail. */ trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + goto sleep; if (transid == trans->transid) { - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + btrfs_commit_transaction(trans, root); } else { btrfs_end_transaction(trans, root); } @@ -2289,7 +2311,7 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); + BUG_ON(!chunk_root->node); /* -ENOMEM */ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); @@ -2429,21 +2451,31 @@ retry_root_backup: log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); - BUG_ON(ret); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_trans_kthread; + } if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - BUG_ON(ret); + ret = btrfs_commit_super(tree_root); + if (ret) + goto fail_trans_kthread; } } ret = btrfs_find_orphan_roots(tree_root); - BUG_ON(ret); + if (ret) + goto fail_trans_kthread; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); - BUG_ON(ret); + if (ret) { + } ret = btrfs_recover_relocation(tree_root); if (ret < 0) { @@ -2863,6 +2895,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); + + /* This shouldn't happen. FUA is masked off if unsupported */ BUG(); } @@ -2879,9 +2913,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", - total_errors); - BUG(); + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } return 0; } @@ -3014,14 +3048,21 @@ int btrfs_commit_super(struct btrfs_root *root) if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; ret = btrfs_write_and_wait_transaction(NULL, root); - BUG_ON(ret); + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to sync btree inode to disk."); + return ret; + } ret = write_ctree_super(NULL, root, 0); return ret; @@ -3366,8 +3407,8 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) spin_unlock(&root->fs_info->ordered_extent_lock); } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) +int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; @@ -3376,6 +3417,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; +again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3397,6 +3439,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); + spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); kfree(head->extent_op); delayed_refs->num_heads--; @@ -3404,8 +3447,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs->num_heads_ready--; list_del_init(&head->cluster); mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; } - spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3649,6 +3693,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } +static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state) +{ + struct super_block *sb = page->mapping->host->i_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + btrfs_error(fs_info, -EIO, + "Error occured while writing out btree at %llu", start); + return -EIO; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3656,4 +3711,5 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, + .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 5f77166fd01..e887ee62b6d 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -193,7 +193,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) if (ret < 0) goto fail; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd6f8ae0a78..4b3f1eedced 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -245,7 +245,7 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -253,13 +253,13 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } kfree(logical); @@ -321,7 +321,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { break; @@ -332,7 +332,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, size = end - start; total_added += size; ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic error */ } return total_added; @@ -474,7 +474,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, int ret = 0; caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); + if (!caching_ctl) + return -ENOMEM; INIT_LIST_HEAD(&caching_ctl->list); mutex_init(&caching_ctl->mutex); @@ -982,7 +983,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; - BUG_ON(ret > 0); + BUG_ON(ret > 0); /* Corruption */ leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, @@ -1008,7 +1009,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, new_size + extra_size, 1); if (ret < 0) return ret; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ btrfs_extend_item(trans, root, path, new_size); @@ -1478,7 +1479,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, err = ret; goto out; } - BUG_ON(ret); + if (ret && !insert) { + err = -ENOENT; + goto out; + } + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1831,6 +1836,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); + /* Error condition is -ENOMEM */ if (!ret) { struct btrfs_bio_stripe *stripe = bbio->stripes; int i; @@ -1846,7 +1852,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, if (!ret) discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) - break; + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1865,6 +1871,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, return ret; } +/* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1940,7 +1947,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); out: btrfs_free_path(path); return err; @@ -2027,6 +2035,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int ret; int err = 0; + if (trans->aborted) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2124,7 +2135,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) { - int ret; + int ret = 0; + + if (trans->aborted) + return 0; + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* @@ -2142,11 +2157,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); - BUG_ON(ret); } } mutex_unlock(&head->mutex); - return 0; + return ret; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -2193,6 +2207,10 @@ again: return NULL; } +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct list_head *cluster) @@ -2281,9 +2299,13 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - BUG_ON(ret); kfree(extent_op); + if (ret) { + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + return ret; + } + goto next; } @@ -2304,11 +2326,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - BUG_ON(ret); btrfs_put_delayed_ref(ref); kfree(extent_op); count++; + + if (ret) { + printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + return ret; + } + next: do_chunk_alloc(trans, root->fs_info->extent_root, 2 * 1024 * 1024, @@ -2343,6 +2370,9 @@ static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, * 0, which means to process everything in the tree at the start * of the run (but not newly added entries), or it can be some target * number you'd like to process. + * + * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count) @@ -2358,6 +2388,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long num_refs = 0; int consider_waiting; + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2415,7 +2449,11 @@ again: } ret = run_clustered_refs(trans, root, &cluster); - BUG_ON(ret < 0); + if (ret < 0) { + spin_unlock(&delayed_refs->lock); + btrfs_abort_transaction(trans, root, ret); + return ret; + } count -= min_t(unsigned long, ret, count); @@ -2580,7 +2618,7 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = -ENOENT; if (path->slots[0] == 0) @@ -2734,7 +2772,6 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } return 0; fail: - BUG(); return ret; } @@ -2763,7 +2800,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); if (ret < 0) goto fail; - BUG_ON(ret); + BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -2771,8 +2808,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } return 0; } @@ -2945,7 +2984,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -2972,7 +3012,9 @@ again: last = cache->key.objectid + cache->key.offset; err = write_one_cache_group(trans, root, path, cache); - BUG_ON(err); + if (err) /* File system offline */ + goto out; + btrfs_put_block_group(cache); } @@ -2985,7 +3027,8 @@ again: if (last == 0) { err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(err); + if (err) /* File system offline */ + goto out; } cache = btrfs_lookup_first_block_group(root->fs_info, last); @@ -3010,20 +3053,21 @@ again: continue; } - btrfs_write_out_cache(root, trans, cache, path); + err = btrfs_write_out_cache(root, trans, cache, path); /* * If we didn't have an error then the cache state is still * NEED_WRITE, so we can set it to WRITTEN. */ - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) cache->disk_cache_state = BTRFS_DC_WRITTEN; last = cache->key.objectid + cache->key.offset; btrfs_put_block_group(cache); } +out: btrfs_free_path(path); - return 0; + return err; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3407,9 +3451,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, 0, 0, &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } - BUG_ON(!space_info); + BUG_ON(!space_info); /* Logic error */ again: spin_lock(&space_info->lock); @@ -3674,8 +3718,10 @@ again: ret = wait_event_interruptible(space_info->wait, !space_info->flush); /* Must have been interrupted, return */ - if (ret) + if (ret) { + printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__); return -EINTR; + } spin_lock(&space_info->lock); } @@ -3832,8 +3878,9 @@ out: return ret; } -static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv = NULL; @@ -4200,6 +4247,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -4536,7 +4584,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) - return -1; + return -ENOENT; if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -4639,7 +4687,7 @@ int btrfs_pin_extent(struct btrfs_root *root, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ pin_down_extent(root, cache, bytenr, num_bytes, reserved); @@ -4657,7 +4705,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ /* * pull in the free space cache (if any) so that our pin @@ -4702,6 +4750,7 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; + spin_lock(&space_info->lock); spin_lock(&cache->lock); if (reserve != RESERVE_FREE) { @@ -4774,7 +4823,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (cache) btrfs_put_block_group(cache); cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); + BUG_ON(!cache); /* Logic error */ } len = cache->key.objectid + cache->key.offset - start; @@ -4811,6 +4860,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, u64 end; int ret; + if (trans->aborted) + return 0; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else @@ -4896,7 +4948,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4914,10 +4967,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; } - } else { + } else if (ret == -ENOENT) { btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " @@ -4927,6 +4981,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)root_objectid, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); + } else { + goto abort; } leaf = path->nodes[0]; @@ -4936,7 +4992,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - BUG_ON(ret < 0); + if (ret < 0) + goto abort; btrfs_release_path(path); path->leave_spinning = 1; @@ -4953,7 +5010,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - BUG_ON(ret); + if (ret < 0) + goto abort; extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -4990,7 +5048,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - BUG_ON(ret); + if (ret) + goto abort; } } else { if (found_extent) { @@ -5007,12 +5066,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - BUG_ON(ret); + if (ret) + goto abort; btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - BUG_ON(ret); + if (ret) + goto abort; } else { invalidate_mapping_pages(info->btree_inode->i_mapping, bytenr >> PAGE_CACHE_SHIFT, @@ -5020,10 +5081,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - BUG_ON(ret); + if (ret) + goto abort; } +out: btrfs_free_path(path); return ret; + +abort: + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } /* @@ -5119,7 +5186,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, parent, root->root_key.objectid, btrfs_header_level(buf), BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (!last_ref) @@ -5153,6 +5220,7 @@ out: btrfs_put_block_group(cache); } +/* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int for_cow) @@ -5174,14 +5242,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, num_bytes, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); } return ret; } @@ -5418,6 +5484,7 @@ have_block_group: found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); + BUG_ON(ret < 0); /* -ENOMEM */ if (block_group->cached == BTRFS_CACHE_FINISHED) goto alloc; @@ -5439,7 +5506,7 @@ have_block_group: if (loop > LOOP_FIND_IDEAL) { ret = cache_block_group(block_group, trans, orig_root, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } /* @@ -5712,6 +5779,11 @@ loop: ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_LIMITED); + if (ret < 0) { + btrfs_abort_transaction(trans, + root, ret); + goto out; + } allowed_chunk_alloc = 0; if (ret == 1) done_chunk_alloc = 1; @@ -5740,6 +5812,7 @@ loop: } else if (ins->objectid) { ret = 0; } +out: return ret; } @@ -5806,10 +5879,15 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) + if (empty_size || root->ref_cows) { ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, CHUNK_ALLOC_NO_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -5821,8 +5899,12 @@ again: num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, + ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes, data, CHUNK_ALLOC_FORCE); + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -5913,7 +5995,10 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -5943,7 +6028,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -5974,7 +6059,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); - BUG_ON(ret); + if (ret) { + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], @@ -6004,7 +6092,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_free_path(path); ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { + if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); @@ -6052,28 +6140,28 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!caching_ctl) { BUG_ON(!block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { num_bytes = caching_ctl->progress - start; ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ start = caching_ctl->progress; num_bytes = ins->objectid + ins->offset - caching_ctl->progress; ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } mutex_unlock(&caching_ctl->mutex); @@ -6082,7 +6170,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); - BUG_ON(ret); + BUG_ON(ret); /* logic error */ btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6218,7 +6306,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); - BUG_ON(IS_ERR(buf)); + BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6230,7 +6318,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); + BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -6245,7 +6333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, extent_op, for_cow); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } return buf; } @@ -6315,7 +6403,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &refs, &flags); - BUG_ON(ret); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; BUG_ON(refs == 0); if (wc->stage == DROP_REFERENCE) { @@ -6382,7 +6472,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + BUG_ON(ret == -ENOMEM); + if (ret) + return ret; BUG_ON(wc->refs[level] == 0); } @@ -6401,12 +6493,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -6478,7 +6570,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, &wc->refs[level - 1], &wc->flags[level - 1]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock(next); + return ret; + } + BUG_ON(wc->refs[level - 1] == 0); *lookup_info = 0; @@ -6547,7 +6643,7 @@ skip: ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); free_extent_buffer(next); @@ -6605,7 +6701,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, eb->start, eb->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + return ret; + } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); @@ -6625,7 +6724,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -6762,7 +6861,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; @@ -6787,7 +6889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out_free; + goto out_end_trans; } WARN_ON(ret > 0); @@ -6807,7 +6909,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level]->len, &wc->refs[level], &wc->flags[level]); - BUG_ON(ret); + if (ret < 0) { + err = ret; + goto out_end_trans; + } BUG_ON(wc->refs[level] == 0); if (level == root_item->drop_level) @@ -6858,26 +6963,40 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } if (block_rsv) trans->block_rsv = block_rsv; } } btrfs_release_path(path); - BUG_ON(err); + if (err) + goto out_end_trans; ret = btrfs_del_root(trans, tree_root, &root->root_key); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + goto out_end_trans; + } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_find_last_root(tree_root, root->root_key.objectid, NULL, NULL); - BUG_ON(ret < 0); - if (ret > 0) { + if (ret < 0) { + btrfs_abort_transaction(trans, tree_root, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { /* if we fail to delete the orphan item this time * around, it'll get picked up the next time. * @@ -6895,8 +7014,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out_free: +out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); +out_free: kfree(wc); btrfs_free_path(path); out: @@ -7099,12 +7219,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) @@ -7567,7 +7691,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7576,7 +7700,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) @@ -7658,7 +7782,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -7668,11 +7792,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, sizeof(cache->item)); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + return ret; + } set_avail_alloc_bits(extent_root->fs_info, type); @@ -7753,7 +7880,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ffa7cc3370c..4c3ce7a0a7a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1244,7 +1244,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) while (index <= end_index) { page = find_get_page(tree->mapping, index); - BUG_ON(!page); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); page_cache_release(page); index++; @@ -1523,7 +1523,7 @@ again: goto out_failed; } } - BUG_ON(ret); + BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); @@ -2200,7 +2200,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) /* Writeback already completed */ if (ret == 0) return 1; - BUG_ON(ret < 0); } if (!uptodate) { @@ -2353,7 +2352,6 @@ error_handled: if (ret == 0) goto error_handled; } - BUG_ON(ret < 0); } if (uptodate) { @@ -2405,6 +2403,10 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } +/* + * Since writes are async, they will only return -ENOMEM. + * Reads can return the full range of I/O error conditions. + */ static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { @@ -2477,7 +2479,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); - BUG_ON(ret < 0); + if (ret < 0) + return ret; bio = NULL; } else { return 0; @@ -2498,10 +2501,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; - else { + else ret = submit_one_bio(rw, bio, mirror_num, bio_flags); - BUG_ON(ret < 0); - } return ret; } @@ -2525,6 +2526,7 @@ static void set_page_extent_head(struct page *page, unsigned long len) * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) + * XXX JDM: This needs looking at to ensure proper page locking */ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, @@ -2687,6 +2689,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); + BUG_ON(ret == -ENOMEM); nr++; *bio_flags = this_bio_flag; } @@ -2713,10 +2716,8 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); - if (bio) { + if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - BUG_ON(ret < 0); - } return ret; } @@ -2830,7 +2831,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end, &page_started, &nr_written); - BUG_ON(ret); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + goto done; + } /* * delalloc_end is already one less than the total * length, so we don't subtract one from @@ -3141,7 +3146,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) rw = WRITE_SYNC; ret = submit_one_bio(rw, epd->bio, 0, 0); - BUG_ON(ret < 0); + BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } } @@ -3257,10 +3262,8 @@ int extent_readpages(struct extent_io_tree *tree, page_cache_release(page); } BUG_ON(!list_empty(pages)); - if (bio) { - int ret = submit_one_bio(READ, bio, 0, bio_flags); - BUG_ON(ret < 0); - } + if (bio) + return submit_one_bio(READ, bio, 0, bio_flags); return 0; } @@ -4090,7 +4093,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) { err = submit_one_bio(READ, bio, mirror_num, bio_flags); - BUG_ON(err < 0); + if (err) + return err; } if (ret || wait != WAIT_COMPLETE) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 89af104c756..a14dbca5974 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -59,7 +59,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, sizeof(*item)); if (ret < 0) goto out; - BUG_ON(ret); + BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -431,7 +431,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, offset = page_offset(bvec->bv_page) + bvec->bv_offset; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; while (bio_index < bio->bi_vcnt) { @@ -450,11 +450,11 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); - BUG_ON(!sums); + BUG_ON(!sums); /* -ENOMEM */ sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); + BUG_ON(!ordered); /* Logic error */ sums->bytenr = ordered->start; } @@ -643,7 +643,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); - BUG_ON(ret && ret != -EAGAIN); + if (ret && ret != -EAGAIN) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } key.offset = end_byte - 1; } else { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0eb80cc4ec8..d83260d7498 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -452,7 +452,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); - BUG_ON(!split || !split2); + BUG_ON(!split || !split2); /* -ENOMEM */ write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -494,7 +494,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = split2; split2 = NULL; @@ -520,7 +520,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); + BUG_ON(ret); /* Logic error */ free_extent_map(split); split = NULL; } @@ -679,7 +679,7 @@ next_slot: root->root_key.objectid, new_key.objectid, start - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ *hint_byte = disk_bytenr; } key.offset = start; @@ -754,7 +754,7 @@ next_slot: root->root_key.objectid, key.objectid, key.offset - extent_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); *hint_byte = disk_bytenr; @@ -770,7 +770,10 @@ next_slot: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } del_nr = 0; del_slot = 0; @@ -782,11 +785,13 @@ next_slot: BUG_ON(1); } - if (del_nr > 0) { + if (!ret && del_nr > 0) { ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +out: btrfs_free_path(path); return ret; } @@ -944,7 +949,10 @@ again: btrfs_release_path(path); goto again; } - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, @@ -963,7 +971,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (split == start) { key.offset = start; @@ -990,7 +998,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } other_start = 0; other_end = start; @@ -1007,7 +1015,7 @@ again: ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], @@ -1025,7 +1033,10 @@ again: btrfs_mark_buffer_dirty(leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } out: btrfs_free_path(path); @@ -1666,7 +1677,13 @@ static long btrfs_fallocate(struct file *file, int mode, em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = (last_byte + mask) & ~mask; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecbcc8711a0..054707ed579 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -230,11 +230,13 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, if (ret) { trans->block_rsv = rsv; - WARN_ON(1); + btrfs_abort_transaction(trans, root, ret); return ret; } ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); trans->block_rsv = rsv; return ret; @@ -1948,14 +1950,14 @@ again: */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - WARN_ON(ret); + WARN_ON(ret); /* -ENOMEM */ goto out; } ret = remove_from_bitmap(ctl, info, &offset, &bytes); if (ret == -EAGAIN) goto again; - BUG_ON(ret); + BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); out: @@ -2346,7 +2348,7 @@ again: rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * block_group->sectorsize, 1); @@ -2439,7 +2441,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; - BUG_ON(ret); + BUG_ON(ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; @@ -2830,6 +2832,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) int ret; ret = search_bitmap(ctl, entry, &offset, &count); + /* Logic error; Should be empty if it can't find anything */ BUG_ON(ret); ino = offset; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index c0792409bb6..7ec58bd7c50 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -135,6 +135,7 @@ out: return ret; } +/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ee15d88b33d..7ca46e6e11a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); + BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) break; info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); + BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->cache_progress) goto free; @@ -443,13 +443,13 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { ret = PTR_ERR(inode); goto out_release; } if (IS_ERR(inode)) { - BUG_ON(retry); + BUG_ON(retry); /* Logic error */ retry = true; ret = create_free_ino_inode(root, trans, path); @@ -460,12 +460,17 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); - WARN_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto out_put; + } } spin_lock(&root->cache_lock); @@ -532,7 +537,7 @@ static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41132339e2e..d6420cca9c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -150,7 +150,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, inode_add_bytes(inode, size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); - BUG_ON(ret); if (ret) { err = ret; goto fail; @@ -206,9 +205,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); - return 0; + return ret; fail: btrfs_free_path(path); return err; @@ -250,14 +249,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); - BUG_ON(ret); + if (ret) + return ret; if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -293,7 +296,7 @@ static noinline int add_async_extent(struct async_cow *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); + BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -433,7 +436,11 @@ again: cont: if (start == 0) { trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto cleanup_and_out; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -450,11 +457,11 @@ cont: total_compressed, compress_type, pages); } - if (ret == 0) { + if (ret <= 0) { /* - * inline extent creation worked, we don't need - * to create any more async work items. Unlock - * and free up our temp pages. + * inline extent creation worked or returned error, + * we don't need to create any more async work items. + * Unlock and free up our temp pages. */ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -547,7 +554,7 @@ cleanup_and_bail_uncompressed: } out: - return 0; + return ret; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -557,6 +564,20 @@ free_pages_out: kfree(pages); goto out; + +cleanup_and_out: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + if (!trans || IS_ERR(trans)) + btrfs_error(root->fs_info, ret, "Failed to join transaction"); + else + btrfs_abort_transaction(trans, root, ret); + goto free_pages_out; } /* @@ -606,6 +627,8 @@ retry: async_extent->ram_size - 1, &page_started, &nr_written, 0); + /* JDM XXX */ + /* * if page_started, cow_file_range inserted an * inline extent and took care of all the unlocking @@ -628,14 +651,19 @@ retry: async_extent->start + async_extent->ram_size - 1); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); - btrfs_end_transaction(trans, root); + if (ret) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + } if (ret) { int i; @@ -649,7 +677,9 @@ retry: unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - goto retry; + if (ret == -ENOSPC) + goto retry; + goto out_free; /* JDM: Requeue? */ } /* @@ -661,7 +691,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -693,7 +723,7 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * clear dirty, set writeback and unlock the pages. @@ -715,13 +745,17 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - - return 0; + ret = 0; +out: + return ret; +out_free: + kfree(async_extent); + goto out; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -790,7 +824,18 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + return PTR_ERR(trans); + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -820,8 +865,10 @@ static noinline int cow_file_range(struct inode *inode, *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; - ret = 0; goto out; + } else if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; } } @@ -838,10 +885,13 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); - BUG_ON(ret); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -867,13 +917,16 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_unlock; + } } if (disk_num_bytes < cur_alloc_size) @@ -898,11 +951,23 @@ static noinline int cow_file_range(struct inode *inode, alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } -out: ret = 0; +out: btrfs_end_transaction(trans, root); return ret; +out_unlock: + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + + goto out; } /* @@ -968,7 +1033,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); + BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = inode; async_cow->root = root; async_cow->locked_page = locked_page; @@ -1059,7 +1124,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_bytenr; u64 num_bytes; int extent_type; - int ret; + int ret, err; int type; int nocow; int check_prev = 1; @@ -1077,7 +1142,11 @@ static noinline int run_delalloc_nocow(struct inode *inode, else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; @@ -1085,7 +1154,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, while (1) { ret = btrfs_lookup_file_extent(trans, root, path, ino, cur_offset, 0); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1099,8 +1171,10 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - BUG_ON(1); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } if (ret > 0) break; leaf = path->nodes[0]; @@ -1188,7 +1262,10 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } cow_start = (u64)-1; } @@ -1197,7 +1274,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(); - BUG_ON(!em); + BUG_ON(!em); /* -ENOMEM */ em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1223,13 +1300,16 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -1248,18 +1328,23 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error; + } } +error: if (nolock) { - ret = btrfs_end_transaction_nolock(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction_nolock(trans, root); } else { - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + err = btrfs_end_transaction(trans, root); } + if (!ret) + ret = err; + btrfs_free_path(path); - return 0; + return ret; } /* @@ -1448,7 +1533,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, int ret = 0; ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -1677,13 +1762,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, &hint, 0); - BUG_ON(ret); + if (ret) + goto out; ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - BUG_ON(ret); + if (ret) + goto out; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1711,10 +1798,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, btrfs_ino(inode), file_pos, &ins); - BUG_ON(ret); +out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -1742,22 +1829,24 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) end - start + 1); if (!ret) return 0; - BUG_ON(!ordered_extent); + BUG_ON(!ordered_extent); /* Logic error */ nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); + BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); } goto out; } @@ -1770,7 +1859,11 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root); else trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_unlock; + } trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -1781,7 +1874,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); - BUG_ON(ret); } else { BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, @@ -1795,11 +1887,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len); - BUG_ON(ret); } unlock_extent_cached(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); @@ -1807,7 +1902,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { ret = btrfs_update_inode_fallback(trans, root, inode); - BUG_ON(ret); + if (ret) { /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = 0; out: @@ -1826,6 +1924,11 @@ out: btrfs_put_ordered_extent(ordered_extent); return 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + goto out; } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, @@ -1907,6 +2010,8 @@ struct delayed_iput { struct inode *inode; }; +/* JDM: If this is fs-wide, why can't we add a pointer to + * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2053,20 +2158,27 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); + BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ } /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret && ret != -EEXIST); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + ret = 0; } /* insert an orphan item to track subvolume contains orphan files */ if (insert >= 2) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); - BUG_ON(ret); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } } return 0; } @@ -2096,7 +2208,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) if (trans && delete_item) { ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } if (release_rsv) @@ -2230,7 +2342,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) } ret = btrfs_del_orphan_item(trans, root, found_key.objectid); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ btrfs_end_transaction(trans, root); continue; } @@ -2612,16 +2724,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, printk(KERN_INFO "btrfs failed to delete reference to %.*s, " "inode %llu parent %llu\n", name_len, name, (unsigned long long)ino, (unsigned long long)dir_ino); + btrfs_abort_transaction(trans, root, ret); goto err; } ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto err; + } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); - BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != 0 && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto err; + } ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); @@ -2779,7 +2897,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = ret; goto out; } - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ if (check_path_shared(root, path)) goto out; btrfs_release_path(path); @@ -2812,7 +2930,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = PTR_ERR(ref); goto out; } - BUG_ON(!ref); + BUG_ON(!ref); /* Logic error */ if (check_path_shared(root, path)) goto out; index = btrfs_inode_ref_index(path->nodes[0], ref); @@ -2919,23 +3037,42 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + goto out; + } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_release_path(path); ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, dir_ino, &index, name, name_len); if (ret < 0) { - BUG_ON(ret != -ENOENT); + if (ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); - BUG_ON(IS_ERR_OR_NULL(di)); + if (IS_ERR_OR_NULL(di)) { + if (!di) + ret = -ENOENT; + else + ret = PTR_ERR(di); + btrfs_abort_transaction(trans, root, ret); + goto out; + } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); @@ -2945,15 +3082,19 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } btrfs_i_size_write(dir, dir->i_size - name_len * 2); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - + if (ret) + btrfs_abort_transaction(trans, root, ret); +out: btrfs_free_path(path); - return 0; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -3212,7 +3353,11 @@ delete: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, ret); + goto error; + } pending_del_nr = 0; } btrfs_release_path(path); @@ -3225,8 +3370,10 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, root, ret); } +error: btrfs_free_path(path); return err; } @@ -3373,7 +3520,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset, 0); - BUG_ON(IS_ERR_OR_NULL(em)); + if (IS_ERR(em)) { + err = PTR_ERR(em); + break; + } last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3390,7 +3540,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size, &hint_byte, 1); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -3400,7 +3550,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 0, hole_size, 0, hole_size, 0, 0, 0); if (err) { - btrfs_update_inode(trans, root, inode); + btrfs_abort_transaction(trans, root, err); btrfs_end_transaction(trans, root); break; } @@ -4581,18 +4731,26 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_ino, index); } - if (ret == 0) { - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, - btrfs_inode_type(inode), index); - if (ret) - goto fail_dir_item; + /* Nothing to clean up yet */ + if (ret) + return ret; - btrfs_i_size_write(parent_inode, parent_inode->i_size + - name_len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode, &key, + btrfs_inode_type(inode), index); + if (ret == -EEXIST) + goto fail_dir_item; + else if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; } + + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + if (ret) + btrfs_abort_transaction(trans, root, ret); return ret; fail_dir_item: @@ -4806,7 +4964,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); - BUG_ON(err); + if (err) + goto fail; d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -5137,7 +5296,7 @@ again: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -5252,6 +5411,7 @@ out: free_extent_map(em); return ERR_PTR(err); } + BUG_ON(!em); /* Error is always set */ return em; } @@ -5868,7 +6028,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -7068,7 +7228,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret) ret = btrfs_update_inode(trans, root, old_inode); } - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (new_inode) { new_inode->i_ctime = CURRENT_TIME; @@ -7086,11 +7249,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - BUG_ON(ret); - if (new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, new_dentry->d_inode); BUG_ON(ret); } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } fixup_inode_flags(new_dir, old_inode); @@ -7098,7 +7264,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { struct dentry *parent = new_dentry->d_parent; @@ -7323,7 +7492,12 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, ins.offset, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); @@ -7345,7 +7519,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + if (own_trans) + btrfs_end_transaction(trans, root); + break; + } if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de25e4255ae..20580920071 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -425,13 +425,18 @@ static noinline int create_subvol(struct btrfs_root *root, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(IS_ERR(new_root)); + if (IS_ERR(new_root)) { + btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); + ret = PTR_ERR(new_root); + goto fail; + } btrfs_record_root_in_trans(trans, new_root); ret = btrfs_create_subvol_root(trans, new_root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -439,13 +444,18 @@ static noinline int create_subvol(struct btrfs_root *root, * insert the directory item */ ret = btrfs_set_inode_index(dir, &index); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } ret = btrfs_insert_dir_item(trans, root, name, namelen, dir, &key, BTRFS_FT_DIR, index); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); goto fail; + } btrfs_i_size_write(dir, dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); @@ -1970,7 +1980,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_key.objectid, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, root, ret); + goto out_end_trans; + } btrfs_record_root_in_trans(trans, dest); @@ -1983,11 +1997,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } } - +out_end_trans: ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; inode->i_flags |= S_DEAD; out_up_write: up_write(&root->fs_info->subvol_sem); @@ -2451,11 +2470,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } leaf = path->nodes[0]; slot = path->slots[0]; @@ -2482,7 +2511,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_ino(inode), new_key.offset - datao, 0); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + goto out; + + } } } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; @@ -2507,11 +2544,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.offset, new_key.offset + datal, &hint_byte, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, + ret); + btrfs_end_transaction(trans, root); + goto out; + } if (skip) { u32 start = @@ -2545,8 +2592,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_i_size_write(inode, endoff); ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); } next: btrfs_release_path(path); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index f8be250963a..24cad1695af 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -58,7 +58,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; - if (ret) { + if (ret) { /* JDM: Really? */ ret = -ENOENT; goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cba7a0bf366..017281dbb2a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4102,10 +4102,11 @@ out: static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; + int ret, err; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4113,11 +4114,11 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); - BUG_ON(ret); - ret = btrfs_end_transaction(trans, root->fs_info->tree_root); - BUG_ON(ret); - return 0; + err = btrfs_end_transaction(trans, root->fs_info->tree_root); + if (err) + return err; + return ret; } /* @@ -4185,7 +4186,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = ret; goto out; } - mark_garbage_root(reloc_root); + ret = mark_garbage_root(reloc_root); + if (ret < 0) { + err = ret; + goto out; + } } } @@ -4231,14 +4236,19 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(fs_root)); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out_free; + } err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); + BUG_ON(err < 0); /* -ENOMEM or logic error */ fs_root->reloc_root = reloc_root; } - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); + if (err) + goto out_free; merge_reloc_roots(rc); @@ -4248,7 +4258,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (IS_ERR(trans)) err = PTR_ERR(trans); else - btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: @@ -4297,6 +4307,8 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); + if (ret) + goto out; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); @@ -4314,6 +4326,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_add_ordered_sum(inode, ordered, sums); } +out: btrfs_put_ordered_extent(ordered); return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1486cf9de1d..24fb8ce4e07 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -97,8 +97,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); goto out; + } if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); @@ -383,6 +385,8 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, * * For a back ref the root_id is the id of the subvol or snapshot and * ref_id is the id of the tree referencing it. + * + * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, @@ -406,7 +410,11 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_free_path(path); + return ret; + } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 794cbb52f30..0209d8a9ae3 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1505,6 +1505,9 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) struct btrfs_device *device = sdev->dev; struct btrfs_root *root = device->dev_root; + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return -EIO; + gen = root->fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0517bd70b04..9db64165123 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -216,7 +216,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - WARN_ON_ONCE(1); + WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5a4999aa8fe..63f835aa978 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -949,18 +949,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - if (ret) { + if (ret == -EEXIST) { pending->error = -EEXIST; dput(parent); goto fail; - } else if (ret) - goto abort_trans; + } else if (ret) { + goto abort_trans_dput; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); if (ret) - goto abort_trans; + goto abort_trans_dput; /* * pull in the delayed directory update @@ -969,8 +970,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * snapshot */ ret = btrfs_run_delayed_items(trans, root); - if (ret) /* Transaction aborted */ + if (ret) { /* Transaction aborted */ + dput(parent); goto fail; + } record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); @@ -986,17 +989,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); - if (ret) - goto abort_trans; + if (ret) { + btrfs_tree_unlock(old); + free_extent_buffer(old); + goto abort_trans_dput; + } btrfs_set_lock_blocking(old); ret = btrfs_copy_root(trans, root, old, &tmp, objectid); - if (ret) - goto abort_trans; - + /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); + if (ret) + goto abort_trans_dput; /* see comments in should_cow_block() */ root->force_cow = 1; @@ -1009,7 +1015,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) - goto abort_trans; + goto abort_trans_dput; /* * insert root back/forward references @@ -1018,14 +1024,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); + dput(parent); if (ret) goto fail; - dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); - if (IS_ERR(pending->snap)) + if (IS_ERR(pending->snap)) { + ret = PTR_ERR(pending->snap); goto abort_trans; + } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) @@ -1037,6 +1045,8 @@ fail: btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; +abort_trans_dput: + dput(parent); abort_trans: btrfs_abort_transaction(trans, root, ret); goto fail; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 37b52b8c672..d017283ae6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1761,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic errors */ } free_extent_buffer(next); continue; @@ -1869,20 +1869,26 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, wret = walk_down_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } wret = walk_up_log_tree(trans, log, path, &level, wc); if (wret > 0) break; - if (wret < 0) + if (wret < 0) { ret = wret; + goto out; + } } /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { - wc->process_func(log, path->nodes[orig_level], wc, + ret = wc->process_func(log, path->nodes[orig_level], wc, btrfs_header_generation(path->nodes[orig_level])); + if (ret) + goto out; if (wc->free) { struct extent_buffer *next; @@ -1898,10 +1904,11 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM or logic errors */ } } +out: for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { free_extent_buffer(path->nodes[i]); @@ -2043,7 +2050,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * wait for them until later. */ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&root->log_mutex); + goto out; + } btrfs_set_root_node(&log->root_item, log->node); @@ -2074,7 +2085,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { - BUG_ON(ret != -ENOSPC); + if (ret != -ENOSPC) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -2114,7 +2129,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_set_super_log_root(root->fs_info->super_for_commit, @@ -2323,7 +2342,9 @@ out_unlock: if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; - } + } else if (ret < 0) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_log_trans(root); return err; @@ -2354,7 +2375,8 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, if (ret == -ENOSPC) { root->fs_info->last_trans_log_full_commit = trans->transid; ret = 0; - } + } else if (ret < 0 && ret != -ENOENT) + btrfs_abort_transaction(trans, root, ret); btrfs_end_log_trans(root); return ret; @@ -3166,13 +3188,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) fs_info->log_root_recovering = 1; trans = btrfs_start_transaction(fs_info->tree_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto error; + } wc.trans = trans; wc.pin = 1; ret = walk_log_tree(trans, log_root_tree, &wc); - BUG_ON(ret); + if (ret) { + btrfs_error(fs_info, ret, "Failed to pin buffers while " + "recovering log root tree."); + goto error; + } again: key.objectid = BTRFS_TREE_LOG_OBJECTID; @@ -3181,8 +3210,12 @@ again: while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - if (ret < 0) - break; + + if (ret < 0) { + btrfs_error(fs_info, ret, + "Couldn't find tree log root."); + goto error; + } if (ret > 0) { if (path->slots[0] == 0) break; @@ -3196,14 +3229,24 @@ again: log = btrfs_read_fs_root_no_radix(log_root_tree, &found_key); - BUG_ON(IS_ERR(log)); + if (IS_ERR(log)) { + ret = PTR_ERR(log); + btrfs_error(fs_info, ret, + "Couldn't read tree log root."); + goto error; + } tmp_key.objectid = found_key.offset; tmp_key.type = BTRFS_ROOT_ITEM_KEY; tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); + if (IS_ERR(wc.replay_dest)) { + ret = PTR_ERR(wc.replay_dest); + btrfs_error(fs_info, ret, "Couldn't read target root " + "for tree log recovery."); + goto error; + } wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); @@ -3251,6 +3294,10 @@ again: kfree(log_root_tree); return 0; + +error: + btrfs_free_path(path); + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 394bf15ea18..57305e88ea8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -549,10 +549,10 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->num_can_discard--; new_device = kmalloc(sizeof(*new_device), GFP_NOFS); - BUG_ON(!new_device); + BUG_ON(!new_device); /* -ENOMEM */ memcpy(new_device, device, sizeof(*new_device)); new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); + BUG_ON(device->name && !new_device->name); /* -ENOMEM */ new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; @@ -1036,8 +1036,10 @@ again: leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + } else { + btrfs_error(root->fs_info, ret, "Slot search failed"); + goto out; } - BUG_ON(ret); if (device->bytes_used > 0) { u64 len = btrfs_dev_extent_length(leaf, extent); @@ -1047,7 +1049,10 @@ again: spin_unlock(&root->fs_info->free_chunk_lock); } ret = btrfs_del_item(trans, root, path); - + if (ret) { + btrfs_error(root->fs_info, ret, + "Failed to remove dev extent item"); + } out: btrfs_free_path(path); return ret; @@ -1117,7 +1122,7 @@ static noinline int find_next_chunk(struct btrfs_root *root, if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { @@ -1161,7 +1166,7 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); @@ -1595,7 +1600,7 @@ next_slot: (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); - BUG_ON(!device); + BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, @@ -1705,7 +1710,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(root); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } device->fs_devices = root->fs_info->fs_devices; @@ -1743,11 +1748,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { ret = init_first_rw_device(trans, root, device); - BUG_ON(ret); + if (ret) + goto error_trans; ret = btrfs_finish_sprout(trans, root); - BUG_ON(ret); + if (ret) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); + if (ret) + goto error_trans; } /* @@ -1757,17 +1766,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_clear_space_info_full(root->fs_info); unlock_chunks(root); - btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + if (ret) /* transaction commit */ + return ret; + ret = btrfs_relocate_sys_chunks(root); - BUG_ON(ret); + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to relocate sys chunks after " + "device initialization. This can be fixed " + "using the \"btrfs balance\" command."); } return ret; + +error_trans: + unlock_chunks(root); + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + kfree(device->name); + kfree(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { @@ -1875,10 +1898,20 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, key.type = BTRFS_CHUNK_ITEM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - BUG_ON(ret); + if (ret < 0) + goto out; + else if (ret > 0) { /* Logic error or corruption */ + btrfs_error(root->fs_info, -ENOENT, + "Failed lookup while freeing chunk."); + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); - + if (ret < 0) + btrfs_error(root->fs_info, ret, + "Failed to delete chunk item."); +out: btrfs_free_path(path); return ret; } @@ -2040,7 +2073,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); @@ -3334,7 +3367,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); - BUG_ON(ret); + if (ret) + goto error; for (i = 0; i < map->num_stripes; ++i) { struct btrfs_device *device; @@ -3347,7 +3381,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, info->chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, dev_offset, stripe_size); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto error; + } } kfree(devices_info); @@ -3465,7 +3502,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - BUG_ON(ret); + if (ret) + return ret; return 0; } @@ -3497,7 +3535,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); - BUG_ON(ret); + if (ret) + return ret; sys_chunk_offset = chunk_offset + chunk_size; @@ -3508,10 +3547,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); - BUG_ON(ret); + if (ret) + goto abort; ret = btrfs_add_device(trans, fs_info->chunk_root, device); - BUG_ON(ret); + if (ret) + goto abort; /* * Modifying chunk tree needs allocating new blocks from both @@ -3521,13 +3562,20 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, */ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, chunk_size, stripe_size); - BUG_ON(ret); + if (ret) + goto abort; ret = __finish_chunk_alloc(trans, extent_root, sys_map, sys_chunk_offset, sys_chunk_size, sys_stripe_size); - BUG_ON(ret); + if (ret) + goto abort; + return 0; + +abort: + btrfs_abort_transaction(trans, root, ret); + return ret; } int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) @@ -3878,7 +3926,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes); buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); - BUG_ON(!buf); + BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { if (devid && map->stripes[i].dev->devid != devid) @@ -4039,7 +4087,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); - BUG_ON(ret); + if (ret) /* -ENOMEM */ + return ret; total_devs = bbio->num_stripes; if (map_length < length) { @@ -4058,7 +4107,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, while (dev_nr < total_devs) { if (dev_nr < total_devs - 1) { bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); + BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; } @@ -4212,7 +4261,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em); write_unlock(&map_tree->map_tree.lock); - BUG_ON(ret); + BUG_ON(ret); /* Tree corruption */ free_extent_map(em); return 0; -- cgit v1.2.3 From 914b20070b413ca10f832c45a58b2894990f065f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Mar 2012 16:05:50 +0100 Subject: btrfs: Fix busyloop in transaction_kthread() When a filesystem got aborted due do error, transaction_kthread() will busyloop. Fix it by going to sleep in that case as well. Maybe we should just stop transaction_kthread() when filesystem is aborted but that would be more complex. Signed-off-by: Jan Kara --- fs/btrfs/disk-io.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 16a0cada26c..438993e3d83 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1640,8 +1640,10 @@ static int transaction_kthread(void *arg) u64 transid; unsigned long now; unsigned long delay; + bool cannot_commit; do { + cannot_commit = false; delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); @@ -1665,8 +1667,10 @@ static int transaction_kthread(void *arg) /* If the file system is aborted, this will always fail. */ trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + cannot_commit = true; goto sleep; + } if (transid == trans->transid) { btrfs_commit_transaction(trans, root); } else { @@ -1679,7 +1683,8 @@ sleep: if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && - !btrfs_transaction_blocked(root->fs_info)) + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) schedule_timeout(delay); __set_current_state(TASK_RUNNING); } -- cgit v1.2.3 From 285ff5af6ce358e73f53b55c9efadd4335f4c2ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Jan 2012 15:27:45 -0500 Subject: Btrfs: remove the ideal caching code This is a relic from before we had the disk space cache and it was to make bootup times when you had btrfs as root not be so damned slow. Now that we have the disk space cache this isn't a problem anymore and really having this code casues uneeded fragmentation and complexity, so just remove it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 93 +++++--------------------------------------------- 1 file changed, 8 insertions(+), 85 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 37e0a800d34..eccef6c0623 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5260,11 +5260,10 @@ static int get_block_group_index(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_FIND_IDEAL = 0, - LOOP_CACHING_NOWAIT = 1, - LOOP_CACHING_WAIT = 2, - LOOP_ALLOC_CHUNK = 3, - LOOP_NO_EMPTY_SIZE = 4, + LOOP_CACHING_NOWAIT = 0, + LOOP_CACHING_WAIT = 1, + LOOP_ALLOC_CHUNK = 2, + LOOP_NO_EMPTY_SIZE = 3, }; /* @@ -5300,8 +5299,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; - u64 ideal_cache_percent = 0; - u64 ideal_cache_offset = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -5351,7 +5348,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, empty_cluster = 0; if (search_start == hint_byte) { -ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); used_block_group = block_group; @@ -5363,8 +5359,7 @@ ideal_cache: * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, data) && - (block_group->cached != BTRFS_CACHE_NO || - search_start == ideal_cache_offset)) { + block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -5418,44 +5413,12 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - u64 free_percent; - found_uncached_bg = true; ret = cache_block_group(block_group, trans, - orig_root, 1); - if (block_group->cached == BTRFS_CACHE_FINISHED) - goto alloc; - - free_percent = btrfs_block_group_used(&block_group->item); - free_percent *= 100; - free_percent = div64_u64(free_percent, - block_group->key.offset); - free_percent = 100 - free_percent; - if (free_percent > ideal_cache_percent && - likely(!block_group->ro)) { - ideal_cache_offset = block_group->key.objectid; - ideal_cache_percent = free_percent; - } - - /* - * The caching workers are limited to 2 threads, so we - * can queue as much work as we care to. - */ - if (loop > LOOP_FIND_IDEAL) { - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret); - } - - /* - * If loop is set for cached only, try the next block - * group. - */ - if (loop == LOOP_FIND_IDEAL) - goto loop; + orig_root, 0); + BUG_ON(ret); } -alloc: if (unlikely(block_group->ro)) goto loop; @@ -5661,9 +5624,7 @@ loop: if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; - /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for - * for them to make caching progress. Also - * determine the best possible bg to cache + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching @@ -5673,45 +5634,7 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { - found_uncached_bg = false; - loop++; - if (!ideal_cache_percent) - goto search; - - /* - * 1 of the following 2 things have happened so far - * - * 1) We found an ideal block group for caching that - * is mostly full and will cache quickly, so we might - * as well wait for it. - * - * 2) We searched for cached only and we didn't find - * anything, and we didn't start any caching kthreads - * either, so chances are we will loop through and - * start a couple caching kthreads, and then come back - * around and just wait for them. This will be slower - * because we will have 2 caching kthreads reading at - * the same time when we could have just started one - * and waited for it to get far enough to give us an - * allocation, so go ahead and go to the wait caching - * loop. - */ - loop = LOOP_CACHING_WAIT; - search_start = ideal_cache_offset; - ideal_cache_percent = 0; - goto ideal_cache; - } else if (loop == LOOP_FIND_IDEAL) { - /* - * Didn't find a uncached bg, wait on anything we find - * next. - */ - loop = LOOP_CACHING_WAIT; - goto search; - } - loop++; - if (loop == LOOP_ALLOC_CHUNK) { if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + -- cgit v1.2.3 From 81c9ad237c604adec79fd4d4034264c6669e0ab3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 18 Jan 2012 10:56:06 -0500 Subject: Btrfs: remove search_start and search_end from find_free_extent and callers We have been passing nothing but (u64)-1 to find_free_extent for search_end in all of the callers, so it's completely useless, and we've always been passing 0 in as search_start, so just remove them as function arguments and move search_start into find_free_extent. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 16 ++++------------ fs/btrfs/inode.c | 9 ++++----- 3 files changed, 9 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80b6486fd5e..edccc948e87 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2466,8 +2466,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data); + struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eccef6c0623..9b7e7682fda 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5277,7 +5277,6 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, - u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 data) { @@ -5286,6 +5285,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; + u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; @@ -5569,11 +5569,6 @@ unclustered_alloc: } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } /* move on to the next group */ if (search_start + num_bytes > @@ -5721,12 +5716,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) + struct btrfs_key *ins, u64 data) { bool final_tried = false; int ret; - u64 search_start = 0; data = btrfs_get_alloc_profile(root, data); again: @@ -5741,8 +5734,7 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, - ins, data); + hint_byte, ins, data); if (ret == -ENOSPC) { if (!final_tried) { @@ -6137,7 +6129,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, (u64)-1, &ins, 0); + empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 892b34785cc..bb268193d85 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -634,8 +634,7 @@ retry: ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, - (u64)-1, &ins, 1); + 0, alloc_hint, &ins, 1); btrfs_end_transaction(trans, root); if (ret) { @@ -838,7 +837,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); + &ins, 1); BUG_ON(ret); em = alloc_extent_map(); @@ -5414,7 +5413,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, (u64)-1, &ins, 1); + alloc_hint, &ins, 1); if (ret) { em = ERR_PTR(ret); goto out; @@ -7315,7 +7314,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, (u64)-1, &ins, 1); + 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); -- cgit v1.2.3 From 727011e07cbdf87772fcc1999cccd15cc915eb62 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Aug 2010 13:21:20 -0400 Subject: Btrfs: allow metadata blocks larger than the page size A few years ago the btrfs code to support blocks lager than the page size was disabled to fix a few corner cases in the page cache handling. This fixes the code to properly support large metadata blocks again. Since current kernels will crash early and often with larger metadata blocks, this adds an incompat bit so that older kernels can't mount it. This also does away with different blocksizes for nodes and leaves. You get a single block size for all tree blocks. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 24 ++++++- fs/btrfs/disk-io.c | 196 ++++++++++++++++++++++++-------------------------- fs/btrfs/extent_io.c | 144 ++++++++++++++++--------------------- fs/btrfs/extent_io.h | 12 ++-- fs/btrfs/inode-item.c | 1 + fs/btrfs/volumes.c | 2 +- 6 files changed, 190 insertions(+), 189 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index edccc948e87..85ab1c5844a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -137,6 +137,12 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +/* + * the max metadata block size. This limit is somewhat artificial, + * but the memmove costs go through the roof for larger blocks. + */ +#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 + /* * we can actually store much bigger names, but lets not confuse the rest * of linux @@ -461,6 +467,19 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -468,6 +487,7 @@ struct btrfs_super_block { (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* @@ -1555,14 +1575,14 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->first_page); \ + type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 534266fe505..68fc93e18db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -370,8 +370,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && - !verify_parent_transid(io_tree, eb, parent_transid)) + if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) return ret; /* @@ -406,14 +405,11 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) u64 found_start; unsigned long len; struct extent_buffer *eb; - int ret; tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) { - WARN_ON(1); + if (page->private == EXTENT_PAGE_PRIVATE) goto out; - } if (!page->private) { WARN_ON(1); goto out; @@ -421,22 +417,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) len = page->private >> 2; WARN_ON(len == 0); - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { - WARN_ON(1); - goto out; - } - ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, - btrfs_header_generation(eb)); - BUG_ON(ret); - WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + eb = find_extent_buffer(tree, start, len); found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); goto err; } - if (eb->first_page != page) { + if (eb->pages[0] != page) { WARN_ON(1); goto err; } @@ -537,6 +525,41 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, + struct page *page, int max_walk) +{ + struct extent_buffer *eb; + u64 start = page_offset(page); + u64 target = start; + u64 min_start; + + if (start < max_walk) + min_start = 0; + else + min_start = start - max_walk; + + while (start >= min_start) { + eb = find_extent_buffer(tree, start, 0); + if (eb) { + /* + * we found an extent buffer and it contains our page + * horray! + */ + if (eb->start <= target && + eb->start + eb->len > target) + return eb; + + /* we found an extent buffer that wasn't for us */ + free_extent_buffer(eb); + return NULL; + } + if (start == 0) + break; + start -= PAGE_CACHE_SIZE; + } + return NULL; +} + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -547,24 +570,25 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; + int reads_done; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; if (!page->private) goto out; + tree = &BTRFS_I(page->mapping->host)->io_tree; len = page->private >> 2; - WARN_ON(len == 0); - eb = alloc_extent_buffer(tree, start, len, page); - if (eb == NULL) { + eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize)); + if (!eb) { ret = -EIO; goto out; } + reads_done = atomic_dec_and_test(&eb->pages_reading); + if (!reads_done) + goto err; found_start = btrfs_header_bytenr(eb); - if (found_start != start) { + if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, @@ -572,13 +596,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - if (eb->first_page != page) { - printk(KERN_INFO "btrfs bad first page %lu %lu\n", - eb->first_page->index, page->index); - WARN_ON(1); - ret = -EIO; - goto err; - } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); @@ -606,14 +623,14 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } - end = min_t(u64, eb->len, PAGE_CACHE_SIZE); - end = eb->start + end - 1; err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } + if (ret && eb) + clear_extent_buffer_uptodate(tree, eb, NULL); free_extent_buffer(eb); out: return ret; @@ -637,7 +654,7 @@ static int btree_io_failed_hook(struct bio *failed_bio, len = page->private >> 2; WARN_ON(len == 0); - eb = alloc_extent_buffer(tree, start, len, page); + eb = alloc_extent_buffer(tree, start, len); if (eb == NULL) goto out; @@ -896,28 +913,14 @@ static int btree_migratepage(struct address_space *mapping, static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct extent_buffer *eb; - int was_dirty; - tree = &BTRFS_I(page->mapping->host)->io_tree; + if (!(current->flags & PF_MEMALLOC)) { return extent_write_full_page(tree, page, btree_get_extent, wbc); } redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); - WARN_ON(!eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE; - spin_unlock(&root->fs_info->delalloc_lock); - } - free_extent_buffer(eb); - unlock_page(page); return 0; } @@ -954,6 +957,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) { struct extent_io_tree *tree; struct extent_map_tree *map; + struct extent_buffer *eb; + struct btrfs_root *root; int ret; if (PageWriteback(page) || PageDirty(page)) @@ -962,6 +967,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; + root = BTRFS_I(page->mapping->host)->root; + if (page->private == EXTENT_PAGE_PRIVATE) { + eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize)); + free_extent_buffer(eb); + if (eb) + return 0; + } /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -1074,20 +1086,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL); + bytenr, blocksize); return eb; } int btrfs_write_tree_block(struct extent_buffer *buf) { - return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return filemap_fdatawait_range(buf->first_page->mapping, + return filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } @@ -1513,41 +1525,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } -static int bio_ready_for_csum(struct bio *bio) -{ - u64 length = 0; - u64 buf_len = 0; - u64 start = 0; - struct page *page; - struct extent_io_tree *io_tree = NULL; - struct bio_vec *bvec; - int i; - int ret; - - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - if (page->private == EXTENT_PAGE_PRIVATE) { - length += bvec->bv_len; - continue; - } - if (!page->private) { - length += bvec->bv_len; - continue; - } - length = bvec->bv_len; - buf_len = page->private >> 2; - start = page_offset(page) + bvec->bv_offset; - io_tree = &BTRFS_I(page->mapping->host)->io_tree; - } - /* are we fully contained in this bio? */ - if (buf_len <= length) - return 1; - - ret = extent_range_uptodate(io_tree, start + length, - start + buf_len - 1); - return ret; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -1563,17 +1540,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio = end_io_wq->bio; fs_info = end_io_wq->info; - /* metadata bio reads are special because the whole tree block must - * be checksummed at once. This makes sure the entire block is in - * ram and up to date before trying to verify things. For - * blocksize <= pagesize, it is basically a noop - */ - if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && - !bio_ready_for_csum(bio)) { - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - return; - } error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; @@ -2135,10 +2101,38 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if (btrfs_super_leafsize(disk_super) != + btrfs_super_nodesize(disk_super)) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksizes don't match. node %d leaf %d\n", + btrfs_super_nodesize(disk_super), + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: couldn't mount because metadata " + "blocksize (%d) was too large\n", + btrfs_super_leafsize(disk_super)); + err = -EINVAL; + goto fail_alloc; + } + features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + + /* + * flag our filesystem as having big metadata blocks if + * they are bigger than the page size + */ + if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -3122,7 +3116,7 @@ int close_ctree(struct btrfs_root *root) int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { int ret; - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, NULL); @@ -3136,14 +3130,14 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->first_page->mapping->host; + struct inode *btree_inode = buf->pages[0]->mapping->host; return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; @@ -3212,7 +3206,7 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; int ret; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a55fbe6252d..c6c9ce463c8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3548,26 +3548,7 @@ out: inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { - struct page *p; - struct address_space *mapping; - - if (i == 0) - return eb->first_page; - i += eb->start >> PAGE_CACHE_SHIFT; - mapping = eb->first_page->mapping; - if (!mapping) - return NULL; - - /* - * extent_buffer_page is only called after pinning the page - * by increasing the reference count. So we know the page must - * be in the radix tree. - */ - rcu_read_lock(); - p = radix_tree_lookup(&mapping->page_tree, i); - rcu_read_unlock(); - - return p; + return eb->pages[i]; } inline unsigned long num_extent_pages(u64 start, u64 len) @@ -3576,6 +3557,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#if LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + if (eb->pages && eb->pages != eb->inline_pages) + kfree(eb->pages); + kmem_cache_free(extent_buffer_cache, eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -3608,21 +3602,25 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, spin_unlock_irqrestore(&leak_lock, flags); #endif atomic_set(&eb->refs, 1); + atomic_set(&eb->pages_reading, 0); + + if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { + struct page **pages; + int num_pages = (len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + pages = kzalloc(num_pages, mask); + if (!pages) { + __free_extent_buffer(eb); + return NULL; + } + eb->pages = pages; + } else { + eb->pages = eb->inline_pages; + } return eb; } -static void __free_extent_buffer(struct extent_buffer *eb) -{ -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - kmem_cache_free(extent_buffer_cache, eb); -} - /* * Helper for releasing extent buffer page. */ @@ -3632,9 +3630,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; - if (!eb->first_page) - return; - index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) return; @@ -3657,8 +3652,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) } struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0) + u64 start, unsigned long len) { unsigned long num_pages = num_extent_pages(start, len); unsigned long i; @@ -3674,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_page_accessed(eb->pages[0]); return eb; } rcu_read_unlock(); @@ -3683,32 +3677,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (!eb) return NULL; - if (page0) { - eb->first_page = page0; - i = 1; - index++; - page_cache_get(page0); - mark_page_accessed(page0); - set_page_extent_mapped(page0); - set_page_extent_head(page0, len); - uptodate = PageUptodate(page0); - } else { - i = 0; - } - for (; i < num_pages; i++, index++) { + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; } - set_page_extent_mapped(p); mark_page_accessed(p); - if (i == 0) { - eb->first_page = p; - set_page_extent_head(p, len); - } else { - set_page_private(p, EXTENT_PAGE_PRIVATE); - } + eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -3716,8 +3692,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * see below about how we avoid a nasty race with release page * and why we unlock later */ - if (i != 0) - unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -3751,15 +3725,23 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->first_page); - set_page_extent_head(eb->first_page, eb->len); - if (!page0) - unlock_page(eb->first_page); + set_page_extent_mapped(eb->pages[0]); + set_page_extent_head(eb->pages[0], eb->len); + SetPageChecked(eb->pages[0]); + for (i = 1; i < num_pages; i++) { + p = extent_buffer_page(eb, i); + set_page_extent_mapped(p); + ClearPageChecked(p); + unlock_page(p); + } + unlock_page(eb->pages[0]); return eb; free_eb: - if (eb->first_page && !page0) - unlock_page(eb->first_page); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) + unlock_page(eb->pages[i]); + } if (!atomic_dec_and_test(&eb->refs)) return exists; @@ -3776,7 +3758,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->first_page); + mark_page_accessed(eb->pages[0]); return eb; } rcu_read_unlock(); @@ -3981,8 +3963,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int ret = 0; int locked_pages = 0; int all_uptodate = 1; - int inc_all_pages = 0; unsigned long num_pages; + unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; @@ -4014,8 +3996,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + num_reads++; all_uptodate = 0; + } } if (all_uptodate) { if (start_i == 0) @@ -4023,20 +4007,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + atomic_set(&eb->pages_reading, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - - WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); if (i == 0) set_page_extent_head(page, eb->len); - - if (inc_all_pages) - page_cache_get(page); if (!PageUptodate(page)) { - if (start_i == 0) - inc_all_pages = 1; ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, @@ -4304,15 +4281,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, { char *dst_kaddr = page_address(dst_page); char *src_kaddr; + int must_memmove = 0; if (dst_page != src_page) { src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; - BUG_ON(areas_overlap(src_off, dst_off, len)); + if (areas_overlap(src_off, dst_off, len)) + must_memmove = 1; } - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + if (must_memmove) + memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); + else + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, @@ -4382,7 +4364,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (!areas_overlap(src_offset, dst_offset, len)) { + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } @@ -4429,7 +4411,8 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) return ret; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + if (atomic_read(&eb->refs) > 1 || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; goto out; } @@ -4442,7 +4425,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ret = 0; goto out; } - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); out: spin_unlock(&tree->buffer_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cecc3518c12..4e38a3d9631 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -119,16 +119,18 @@ struct extent_state { struct list_head leak_list; }; +#define INLINE_EXTENT_BUFFER_PAGES 16 +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) struct extent_buffer { u64 start; unsigned long len; unsigned long map_start; unsigned long map_len; - struct page *first_page; unsigned long bflags; + atomic_t refs; + atomic_t pages_reading; struct list_head leak_list; struct rcu_head rcu_head; - atomic_t refs; pid_t lock_owner; /* count of read lock holders on the extent buffer */ @@ -152,6 +154,9 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; + wait_queue_head_t lock_wq; + struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; + struct page **pages; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -251,8 +256,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len, - struct page *page0); + u64 start, unsigned long len); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index baa74f3db69..6ea71c60e80 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -19,6 +19,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "print-tree.h" static int find_name_in_backref(struct btrfs_path *path, const char *name, int name_len, struct btrfs_inode_ref **ref_ret) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ef41f285a47..58aad63e1ad 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4384,7 +4384,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * to silence the warning eg. on PowerPC 64. */ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->first_page); + SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3 From 4f2de97acee6532b36dd6e995b858343771ad126 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 7 Mar 2012 16:20:05 -0500 Subject: Btrfs: set page->private to the eb We spend a lot of time looking up extent buffers from pages when we could just store the pointer to the eb the page is associated with in page->private. This patch does just that, and it makes things a little simpler and reduces a bit of CPU overhead involved with doing metadata IO. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 91 +++++++++++++-------------------------------------- fs/btrfs/extent_io.c | 92 ++++++++++++++++++++++++++++++++++++++-------------- fs/btrfs/extent_io.h | 1 + 3 files changed, 91 insertions(+), 93 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 68fc93e18db..bc88649cffb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -403,39 +403,28 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - unsigned long len; struct extent_buffer *eb; tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) { - WARN_ON(1); - goto out; - } - len = page->private >> 2; - WARN_ON(len == 0); - - eb = find_extent_buffer(tree, start, len); + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return 0; found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); - goto err; + return 0; } if (eb->pages[0] != page) { WARN_ON(1); - goto err; + return 0; } if (!PageUptodate(page)) { WARN_ON(1); - goto err; + return 0; } csum_tree_block(root, eb, 0); -err: - free_extent_buffer(eb); -out: return 0; } @@ -566,7 +555,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_io_tree *tree; u64 found_start; int found_level; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; int ret = 0; @@ -576,13 +564,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto out; tree = &BTRFS_I(page->mapping->host)->io_tree; - len = page->private >> 2; + eb = (struct extent_buffer *)page->private; - eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize)); - if (!eb) { - ret = -EIO; - goto out; - } reads_done = atomic_dec_and_test(&eb->pages_reading); if (!reads_done) goto err; @@ -631,7 +614,6 @@ err: if (ret && eb) clear_extent_buffer_uptodate(tree, eb, NULL); - free_extent_buffer(eb); out: return ret; } @@ -640,31 +622,17 @@ static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, int mirror_num, struct extent_state *state) { - struct extent_io_tree *tree; - unsigned long len; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) - goto out; - if (!page->private) - goto out; - - len = page->private >> 2; - WARN_ON(len == 0); - - eb = alloc_extent_buffer(tree, start, len); - if (eb == NULL) - goto out; + eb = (struct extent_buffer *)page->private; + if (page != eb->pages[0]) + return -EIO; if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, -EIO); } - free_extent_buffer(eb); - -out: return -EIO; /* we fixed nothing */ } @@ -955,10 +923,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; struct extent_map_tree *map; - struct extent_buffer *eb; - struct btrfs_root *root; + struct extent_io_tree *tree; int ret; if (PageWriteback(page) || PageDirty(page)) @@ -967,13 +933,6 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; - root = BTRFS_I(page->mapping->host)->root; - if (page->private == EXTENT_PAGE_PRIVATE) { - eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize)); - free_extent_buffer(eb); - if (eb) - return 0; - } /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -985,14 +944,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) if (!ret) return 0; - ret = try_release_extent_buffer(tree, page); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - - return ret; + return try_release_extent_buffer(tree, page); } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -3219,17 +3171,21 @@ static int btree_lock_page_hook(struct page *page, void *data, { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; - unsigned long len; - u64 bytenr = page_offset(page); - if (page->private == EXTENT_PAGE_PRIVATE) + /* + * We culled this eb but the page is still hanging out on the mapping, + * carry on. + */ + if (!PagePrivate(page)) goto out; - len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len); - if (!eb) + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + goto out; + } + if (page != eb->pages[0]) goto out; if (!btrfs_try_tree_write_lock(eb)) { @@ -3248,7 +3204,6 @@ static int btree_lock_page_hook(struct page *page, void *data, } btrfs_tree_unlock(eb); - free_extent_buffer(eb); out: if (!trylock_page(page)) { flush_fn(data); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c6c9ce463c8..0381b6007ae 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2473,19 +2473,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void set_page_extent_mapped(struct page *page) +void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); + set_page_private(page, (unsigned long)eb); + } else { + WARN_ON(page->private != (unsigned long)eb); } } -static void set_page_extent_head(struct page *page, unsigned long len) +void set_page_extent_mapped(struct page *page) { - WARN_ON(!PagePrivate(page)); - set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } } /* @@ -3585,6 +3590,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; + eb->tree = tree; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3637,8 +3643,31 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) + if (page) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ page_cache_release(page); + } } while (index != start_idx); } @@ -3683,6 +3712,32 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, WARN_ON(1); goto free_eb; } + + spin_lock(&mapping->private_lock); + if (PagePrivate(p)) { + /* + * We could have already allocated an eb for this page + * and attached one so lets see if we can get a ref on + * the existing eb, and if we can we know it's good and + * we can just return that one, else we know we can just + * overwrite page->private. + */ + exists = (struct extent_buffer *)p->private; + if (atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + goto free_eb; + } + + /* + * Do this so attach doesn't complain and we need to + * drop the ref the old guy had. + */ + ClearPagePrivate(p); + page_cache_release(p); + } + attach_extent_buffer_page(eb, p); + spin_unlock(&mapping->private_lock); mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) @@ -3705,7 +3760,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - /* add one reference for the caller */ atomic_inc(&exists->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3725,12 +3779,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, * after the extent buffer is in the radix tree so * it doesn't get lost */ - set_page_extent_mapped(eb->pages[0]); - set_page_extent_head(eb->pages[0], eb->len); SetPageChecked(eb->pages[0]); for (i = 1; i < num_pages; i++) { p = extent_buffer_page(eb, i); - set_page_extent_mapped(p); ClearPageChecked(p); unlock_page(p); } @@ -3794,10 +3845,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, lock_page(page); WARN_ON(!PagePrivate(page)); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); - clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); if (!PageDirty(page)) { @@ -4010,9 +4057,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->pages_reading, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - set_page_extent_mapped(page); - if (i == 0) - set_page_extent_head(page, eb->len); if (!PageUptodate(page)) { ClearPageError(page); err = __extent_read_full_page(tree, page, @@ -4395,22 +4439,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - btrfs_release_extent_buffer(eb); + __free_extent_buffer(eb); } int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) { u64 start = page_offset(page); - struct extent_buffer *eb; + struct extent_buffer *eb = (struct extent_buffer *)page->private; int ret = 1; - spin_lock(&tree->buffer_lock); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) { - spin_unlock(&tree->buffer_lock); - return ret; - } + if (!PagePrivate(page) || !eb) + return 1; + spin_lock(&tree->buffer_lock); if (atomic_read(&eb->refs) > 1 || test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; @@ -4426,6 +4467,7 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) goto out; } radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); + btrfs_release_extent_buffer_page(eb, 0); out: spin_unlock(&tree->buffer_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4e38a3d9631..83e432da2e2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -127,6 +127,7 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; unsigned long bflags; + struct extent_io_tree *tree; atomic_t refs; atomic_t pages_reading; struct list_head leak_list; -- cgit v1.2.3 From 115391d2315239164e400a8259b26392afccf3bd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Mar 2012 09:51:43 -0500 Subject: Btrfs: only use the existing eb if it's count isn't 0 We can run into a problem where we find an eb for our existing page already on the radix tree but it has a ref count of 0. It hasn't yet been removed by RCU yet so this can cause issues where we will use the EB after free. So do atomic_inc_not_zero on the exists->refs and if it is zero just do synchronize_rcu() and try again. We won't have to worry about new allocators coming in since they will block on the page lock at this point. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0381b6007ae..0f74262911b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3750,7 +3750,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - +again: ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto free_eb; @@ -3760,7 +3760,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (ret == -EEXIST) { exists = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - atomic_inc(&exists->refs); + if (!atomic_inc_not_zero(&exists->refs)) { + spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); + synchronize_rcu(); + exists = NULL; + goto again; + } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); goto free_eb; -- cgit v1.2.3 From 3083ee2e18b701122a3b841db83448543a87a583 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Mar 2012 16:01:49 -0500 Subject: Btrfs: introduce free_extent_buffer_stale Because btrfs cow's we can end up with extent buffers that are no longer necessary just sitting around in memory. So instead of evicting these pages, we could end up evicting things we actually care about. Thus we have free_extent_buffer_stale for use when we are freeing tree blocks. This will make it so that the ref for the eb being in the radix tree is dropped as soon as possible and then is freed when the refcount hits 0 instead of waiting to be released by releasepage. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 31 ++++++-- fs/btrfs/disk-io.c | 14 +--- fs/btrfs/extent-tree.c | 5 +- fs/btrfs/extent_io.c | 205 +++++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/extent_io.h | 6 +- 5 files changed, 201 insertions(+), 60 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0639a555e16..74c03fb0ca1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -504,7 +517,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -959,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1016,7 +1029,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1056,7 +1069,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -3781,7 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + free_extent_buffer_stale(leaf); return 0; } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bc88649cffb..0ba055e03eb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -923,16 +923,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_map_tree *map; - struct extent_io_tree *tree; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -940,11 +932,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - return try_release_extent_buffer(tree, page); + return try_release_extent_buffer(page, gfp_flags); } static void btree_invalidatepage(struct page *page, unsigned long offset) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9b7e7682fda..1b831ac4c07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5018,10 +5018,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0); @@ -6022,6 +6018,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0f74262911b..0ce14369920 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3607,6 +3607,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); atomic_set(&eb->pages_reading, 0); @@ -3654,6 +3655,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, */ if (PagePrivate(page) && page->private == (unsigned long)eb) { + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* * We need to make sure we haven't be attached * to a new eb. @@ -3763,7 +3766,6 @@ again: if (!atomic_inc_not_zero(&exists->refs)) { spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); - synchronize_rcu(); exists = NULL; goto again; } @@ -3772,7 +3774,10 @@ again: goto free_eb; } /* add one reference for the tree */ + spin_lock(&eb->refs_lock); atomic_inc(&eb->refs); + set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3823,15 +3828,143 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +static int extent_buffer_under_io(struct extent_buffer *eb, + struct page *locked_page) +{ + unsigned long num_pages, i; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + int need_unlock = 0; + + if (!page) + continue; + + if (page != locked_page) { + if (!trylock_page(page)) + return 1; + need_unlock = 1; + } + + if (PageDirty(page) || PageWriteback(page)) { + if (need_unlock) + unlock_page(page); + return 1; + } + if (need_unlock) + unlock_page(page); + } + + return 0; +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + int ret; + + spin_unlock(&eb->refs_lock); + + might_sleep_if(mask & __GFP_WAIT); + ret = clear_extent_bit(tree, eb->start, + eb->start + eb->len - 1, -1, 0, 0, + NULL, mask); + if (ret < 0) { + unsigned long num_pages, i; + + num_pages = num_extent_pages(eb->start, eb->len); + /* + * We failed to clear the state bits which likely means + * ENOMEM, so just re-up the eb ref and continue, we + * will get freed later on via releasepage or something + * else and will be ok. + */ + spin_lock(&eb->tree->mapping->private_lock); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); + atomic_inc(&eb->refs); + + /* + * We may have started to reclaim the pages for a newly + * allocated eb, make sure we own all of them again. + */ + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) { + WARN_ON(1); + continue; + } + + BUG_ON(!PagePrivate(page)); + if (page->private != (unsigned long)eb) { + ClearPagePrivate(page); + page_cache_release(page); + attach_extent_buffer_page(eb, page); + } + } + spin_unlock(&eb->refs_lock); + spin_unlock(&eb->tree->mapping->private_lock); + return; + } + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb, NULL) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb, NULL) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } int clear_extent_buffer_dirty(struct extent_io_tree *tree, @@ -3874,6 +4007,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); return was_dirty; @@ -4440,45 +4574,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) -{ - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - __free_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - u64 start = page_offset(page); - struct extent_buffer *eb = (struct extent_buffer *)page->private; - int ret = 1; + struct extent_buffer *eb; - if (!PagePrivate(page) || !eb) + /* + * We need to make sure noboody is attaching this page to an eb right + * now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); return 1; + } - spin_lock(&tree->buffer_lock); - if (atomic_read(&eb->refs) > 1 || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + + /* + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb, page)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; } + spin_unlock(&page->mapping->private_lock); + + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; } - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); - btrfs_release_extent_buffer_page(eb, 0); -out: - spin_unlock(&tree->buffer_lock); + release_extent_buffer(eb, mask); - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 83e432da2e2..60628341f15 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,6 +35,8 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -128,6 +130,7 @@ struct extent_buffer { unsigned long map_len; unsigned long bflags; struct extent_io_tree *tree; + spinlock_t refs_lock; atomic_t refs; atomic_t pages_reading; struct list_head leak_list; @@ -184,7 +187,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_buffer(struct page *page, gfp_t mask); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -261,6 +264,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -- cgit v1.2.3 From 5df4235ea15bd39f441ef334d8329b3d46b2cc57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 15 Mar 2012 18:24:42 -0400 Subject: Btrfs: introduce mark_extent_buffer_accessed Because an eb can have multiple pages we need to make sure that all pages within the eb are markes as accessed, since releasepage can be called against any page in the eb. This will keep us from possibly evicting hot eb's when we're doing larger than pagesize eb's. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0ce14369920..4a97d8fd958 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3683,6 +3683,17 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void mark_extent_buffer_accessed(struct extent_buffer *eb) +{ + unsigned long num_pages, i; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + mark_page_accessed(p); + } +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len) { @@ -3700,7 +3711,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->pages[0]); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); @@ -3729,6 +3740,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (atomic_inc_not_zero(&exists->refs)) { spin_unlock(&mapping->private_lock); unlock_page(p); + mark_extent_buffer_accessed(exists); goto free_eb; } @@ -3771,6 +3783,7 @@ again: } spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + mark_extent_buffer_accessed(exists); goto free_eb; } /* add one reference for the tree */ @@ -3820,7 +3833,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_page_accessed(eb->pages[0]); + mark_extent_buffer_accessed(eb); return eb; } rcu_read_unlock(); -- cgit v1.2.3 From 0b32f4bbb423f02acee6d43cd442f5f0775db7e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 13 Mar 2012 09:38:00 -0400 Subject: Btrfs: ensure an entire eb is written at once This patch simplifies how we track our extent buffers. Previously we could exit writepages with only having written half of an extent buffer, which meant we had to track the state of the pages and the state of the extent buffers differently. Now we only read in entire extent buffers and write out entire extent buffers, this allows us to simply set bits in our bflags to indicate the state of the eb and we no longer have to do things like track uptodate with our iotree. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 76 ++++---- fs/btrfs/extent_io.c | 497 +++++++++++++++++++++++++++++++++++---------------- fs/btrfs/extent_io.h | 24 ++- fs/btrfs/inode.c | 2 + 4 files changed, 390 insertions(+), 209 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0ba055e03eb..c54aec87e89 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -333,7 +333,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state, GFP_NOFS); - if (extent_buffer_uptodate(io_tree, eb, cached_state) && + if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(io_tree, eb, &cached_state); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); @@ -566,7 +566,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, tree = &BTRFS_I(page->mapping->host)->io_tree; eb = (struct extent_buffer *)page->private; - reads_done = atomic_dec_and_test(&eb->pages_reading); + /* the pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + extent_buffer_get(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); if (!reads_done) goto err; @@ -606,14 +611,17 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ret = -EIO; } + if (!ret) + set_extent_buffer_uptodate(eb); err: if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, ret); } - if (ret && eb) - clear_extent_buffer_uptodate(tree, eb, NULL); + if (ret) + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); out: return ret; } @@ -878,20 +886,6 @@ static int btree_migratepage(struct address_space *mapping, } #endif -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (!(current->flags & PF_MEMALLOC)) { - return extent_write_full_page(tree, page, - btree_get_extent, wbc); - } - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -911,7 +905,7 @@ static int btree_writepages(struct address_space *mapping, if (num_dirty < thresh) return 0; } - return extent_writepages(tree, mapping, btree_get_extent, wbc); + return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) @@ -950,15 +944,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } +static int btree_set_page_dirty(struct page *page) +{ + struct extent_buffer *eb; + + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); +} + static const struct address_space_operations btree_aops = { .readpage = btree_readpage, - .writepage = btree_writepage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif + .set_page_dirty = btree_set_page_dirty, }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1001,7 +1008,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer(buf); return -EIO; - } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + } else if (extent_buffer_uptodate(buf)) { *eb = buf; } else { free_extent_buffer(buf); @@ -1054,9 +1061,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return buf; } @@ -1064,7 +1068,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1080,8 +1083,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + clear_extent_buffer_dirty(buf); } return 0; } @@ -1948,6 +1950,7 @@ int open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -3058,8 +3061,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int ret; struct inode *btree_inode = buf->pages[0]->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, - NULL); + ret = extent_buffer_uptodate(buf); if (!ret) return ret; @@ -3070,16 +3072,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_set_buffer_uptodate(struct extent_buffer *buf) { - struct inode *btree_inode = buf->pages[0]->mapping->host; - return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, - buf); + return set_extent_buffer_uptodate(buf); } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; u64 transid = btrfs_header_generation(buf); - struct inode *btree_inode = root->fs_info->btree_inode; int was_dirty; btrfs_assert_tree_locked(buf); @@ -3091,8 +3090,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)root->fs_info->generation); WARN_ON(1); } - was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, - buf); + was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); root->fs_info->dirty_metadata_bytes += buf->len; @@ -3147,11 +3145,7 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - int ret; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - if (ret == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); - return ret; + return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } static int btree_lock_page_hook(struct page *page, void *data, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4a97d8fd958..c1b898d590d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -19,6 +19,7 @@ #include "btrfs_inode.h" #include "volumes.h" #include "check-integrity.h" +#include "locking.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -53,6 +54,8 @@ struct extent_page_data { unsigned int sync_io:1; }; +static noinline void flush_write_bio(void *data); + int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("extent_state", @@ -2337,7 +2340,7 @@ error_handled: } } - if (uptodate) { + if (uptodate && tree->track_uptodate) { set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } @@ -2973,6 +2976,275 @@ done_unlocked: return 0; } +static int eb_wait(void *word) +{ + io_schedule(); + return 0; +} + +static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, + TASK_UNINTERRUPTIBLE); +} + +static int lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) +{ + unsigned long i, num_pages; + int flush = 0; + int ret = 0; + + if (!btrfs_try_tree_write_lock(eb)) { + flush = 1; + flush_write_bio(epd); + btrfs_tree_lock(eb); + } + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); + if (!epd->sync_io) + return 0; + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + printk(KERN_ERR "Um, ok?\n"); + btrfs_tree_unlock(eb); + return 0; + } + } + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_lock(&fs_info->delalloc_lock); + if (fs_info->dirty_metadata_bytes >= eb->len) + fs_info->dirty_metadata_bytes -= eb->len; + else + WARN_ON(1); + spin_unlock(&fs_info->delalloc_lock); + ret = 1; + } + + btrfs_tree_unlock(eb); + + if (!ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + if (!trylock_page(p)) { + if (!flush) { + flush_write_bio(epd); + flush = 1; + } + lock_page(p); + } + } + + return ret; +} + +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_clear_bit(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + +static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_buffer *eb; + int done; + + do { + struct page *page = bvec->bv_page; + + bvec--; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + done = atomic_dec_and_test(&eb->io_pages); + + if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + ClearPageUptodate(page); + SetPageError(page); + } + + end_page_writeback(page); + + if (!done) + continue; + + end_extent_buffer_writeback(eb); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + +} + +static int write_one_eb(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct block_device *bdev = fs_info->fs_devices->latest_bdev; + u64 offset = eb->start; + unsigned long i, num_pages; + int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int ret; + + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); + atomic_set(&eb->io_pages, num_pages); + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + + clear_page_dirty_for_io(p); + set_page_writeback(p); + ret = submit_extent_page(rw, eb->tree, p, offset >> 9, + PAGE_CACHE_SIZE, 0, bdev, &epd->bio, + -1, end_bio_extent_buffer_writepage, + 0, 0, 0); + if (ret) { + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + SetPageError(p); + if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) + end_extent_buffer_writeback(eb); + ret = -EIO; + break; + } + offset += PAGE_CACHE_SIZE; + update_nr_written(p, wbc, 1); + unlock_page(p); + } + + if (unlikely(ret)) { + for (; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + unlock_page(p); + } + } + + return ret; +} + +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + struct extent_buffer *eb, *prev_eb = NULL; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, + }; + int ret = 0; + int done = 0; + int nr_to_write_done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int tag; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + scanned = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!done && !nr_to_write_done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!PagePrivate(page)) + continue; + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + break; + } + + eb = (struct extent_buffer *)page->private; + if (!eb) { + WARN_ON(1); + continue; + } + + if (eb == prev_eb) + continue; + + if (!atomic_inc_not_zero(&eb->refs)) { + WARN_ON(1); + continue; + } + + prev_eb = eb; + ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + if (!ret) { + free_extent_buffer(eb); + continue; + } + + ret = write_one_eb(eb, fs_info, wbc, &epd); + if (ret) { + done = 1; + free_extent_buffer(eb); + break; + } + free_extent_buffer(eb); + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + flush_write_bio(&epd); + return ret; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -3609,7 +3881,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - atomic_set(&eb->pages_reading, 0); + atomic_set(&eb->io_pages, 0); if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { struct page **pages; @@ -3628,6 +3900,13 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return eb; } +static int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} + /* * Helper for releasing extent buffer page. */ @@ -3637,6 +3916,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; struct page *page; + BUG_ON(extent_buffer_under_io(eb)); + index = num_extent_pages(eb->start, eb->len); if (start_idx >= index) return; @@ -3655,6 +3936,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, */ if (PagePrivate(page) && page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(PageDirty(page)); BUG_ON(PageWriteback(page)); /* @@ -3683,10 +3965,41 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) __free_extent_buffer(eb); } +static void check_buffer_tree_ref(struct extent_buffer *eb) +{ + /* the ref bit is tricky. We have to make sure it is set + * if we have the buffer dirty. Otherwise the + * code to free a buffer can end up dropping a dirty + * page + * + * Once the ref bit is set, it won't go away while the + * buffer is dirty or in writeback, and it also won't + * go away while we have the reference count on the + * eb bumped. + * + * We can't just set the ref bit without bumping the + * ref on the eb because free_extent_buffer might + * see the ref bit and try to clear it. If this happens + * free_extent_buffer might end up dropping our original + * ref by mistake and freeing the page before we are able + * to add one more ref. + * + * So bump the ref count first, then set the bit. If someone + * beat us to it, drop the ref we added. + */ + if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + atomic_inc(&eb->refs); + if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + } +} + static void mark_extent_buffer_accessed(struct extent_buffer *eb) { unsigned long num_pages, i; + check_buffer_tree_ref(eb); + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); @@ -3744,15 +4057,17 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, goto free_eb; } - /* + /* * Do this so attach doesn't complain and we need to * drop the ref the old guy had. */ ClearPagePrivate(p); + WARN_ON(PageDirty(p)); page_cache_release(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) @@ -3788,8 +4103,7 @@ again: } /* add one reference for the tree */ spin_lock(&eb->refs_lock); - atomic_inc(&eb->refs); - set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); + check_buffer_tree_ref(eb); spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3849,90 +4163,15 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) __free_extent_buffer(eb); } -static int extent_buffer_under_io(struct extent_buffer *eb, - struct page *locked_page) -{ - unsigned long num_pages, i; - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; - int need_unlock = 0; - - if (!page) - continue; - - if (page != locked_page) { - if (!trylock_page(page)) - return 1; - need_unlock = 1; - } - - if (PageDirty(page) || PageWriteback(page)) { - if (need_unlock) - unlock_page(page); - return 1; - } - if (need_unlock) - unlock_page(page); - } - - return 0; -} - /* Expects to have eb->eb_lock already held */ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { struct extent_io_tree *tree = eb->tree; - int ret; spin_unlock(&eb->refs_lock); - might_sleep_if(mask & __GFP_WAIT); - ret = clear_extent_bit(tree, eb->start, - eb->start + eb->len - 1, -1, 0, 0, - NULL, mask); - if (ret < 0) { - unsigned long num_pages, i; - - num_pages = num_extent_pages(eb->start, eb->len); - /* - * We failed to clear the state bits which likely means - * ENOMEM, so just re-up the eb ref and continue, we - * will get freed later on via releasepage or something - * else and will be ok. - */ - spin_lock(&eb->tree->mapping->private_lock); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); - atomic_inc(&eb->refs); - - /* - * We may have started to reclaim the pages for a newly - * allocated eb, make sure we own all of them again. - */ - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; - - if (!page) { - WARN_ON(1); - continue; - } - - BUG_ON(!PagePrivate(page)); - if (page->private != (unsigned long)eb) { - ClearPagePrivate(page); - page_cache_release(page); - attach_extent_buffer_page(eb, page); - } - } - spin_unlock(&eb->refs_lock); - spin_unlock(&eb->tree->mapping->private_lock); - return; - } - spin_lock(&tree->buffer_lock); radix_tree_delete(&tree->buffer, eb->start >> PAGE_CACHE_SHIFT); @@ -3955,7 +4194,7 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && - !extent_buffer_under_io(eb, NULL) && + !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_dec(&eb->refs); @@ -3974,20 +4213,20 @@ void free_extent_buffer_stale(struct extent_buffer *eb) spin_lock(&eb->refs_lock); set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb, NULL) && + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_dec(&eb->refs); release_extent_buffer(eb, GFP_NOFS); } -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int clear_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; struct page *page; num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); @@ -4008,25 +4247,30 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, ClearPageError(page); unlock_page(page); } + WARN_ON(atomic_read(&eb->refs) == 0); return 0; } -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_dirty(struct extent_buffer *eb) { unsigned long i; unsigned long num_pages; int was_dirty = 0; + check_buffer_tree_ref(eb); + was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + num_pages = num_extent_pages(eb->start, eb->len); WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + for (i = 0; i < num_pages; i++) - __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_page_dirty(extent_buffer_page(eb, i)); return was_dirty; } -static int __eb_straddles_pages(u64 start, u64 len) +static int range_straddles_pages(u64 start, u64 len) { if (len < PAGE_CACHE_SIZE) return 1; @@ -4037,25 +4281,14 @@ static int __eb_straddles_pages(u64 start, u64 len) return 0; } -static int eb_straddles_pages(struct extent_buffer *eb) -{ - return __eb_straddles_pages(eb->start, eb->len); -} - -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state) +int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; - num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - + num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -4064,27 +4297,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, return 0; } -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) +int set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; unsigned long num_pages; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); - - if (eb_straddles_pages(eb)) { - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); - } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || - ((i == num_pages - 1) && - ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { - check_page_uptodate(tree, page); - continue; - } SetPageUptodate(page); } return 0; @@ -4099,7 +4321,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - if (__eb_straddles_pages(start, end - start + 1)) { + if (range_straddles_pages(start, end - start + 1)) { ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) @@ -4121,35 +4343,9 @@ int extent_range_uptodate(struct extent_io_tree *tree, return pg_uptodate; } -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state) +int extent_buffer_uptodate(struct extent_buffer *eb) { - int ret = 0; - unsigned long num_pages; - unsigned long i; - struct page *page; - int pg_uptodate = 1; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 1; - - if (eb_straddles_pages(eb)) { - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - pg_uptodate = 0; - break; - } - } - return pg_uptodate; + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } int read_extent_buffer_pages(struct extent_io_tree *tree, @@ -4171,13 +4367,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (eb_straddles_pages(eb)) { - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; - } - } - if (start) { WARN_ON(start < eb->start); start_i = (start >> PAGE_CACHE_SHIFT) - @@ -4207,7 +4396,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } - atomic_set(&eb->pages_reading, num_reads); + atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (!PageUptodate(page)) { @@ -4235,8 +4424,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ret = -EIO; } - if (!ret) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -4604,13 +4791,13 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) eb = (struct extent_buffer *)page->private; BUG_ON(!eb); - /* + /* * This is a little awful but should be ok, we need to make sure that * the eb doesn't disappear out from under us while we're looking at * this page. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb, page)) { + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); spin_unlock(&page->mapping->private_lock); return 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 60628341f15..489d7945154 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -37,6 +37,8 @@ #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ #define EXTENT_BUFFER_TREE_REF 5 #define EXTENT_BUFFER_STALE 6 +#define EXTENT_BUFFER_WRITEBACK 7 +#define EXTENT_BUFFER_IOERR 8 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -99,6 +101,7 @@ struct extent_io_tree { struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; + int track_uptodate; spinlock_t lock; spinlock_t buffer_lock; struct extent_io_ops *ops; @@ -132,7 +135,7 @@ struct extent_buffer { struct extent_io_tree *tree; spinlock_t refs_lock; atomic_t refs; - atomic_t pages_reading; + atomic_t io_pages; struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; @@ -249,6 +252,8 @@ int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, get_extent_t *get_extent, struct writeback_control *wbc); +int btree_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, struct list_head *pages, unsigned nr_pages, @@ -297,18 +302,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -int clear_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_io_tree *tree, - struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state **cached_state); -int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb, - struct extent_state *cached_state); +int clear_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb268193d85..341a8670165 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6782,6 +6782,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); + ei->io_tree.track_uptodate = 1; + ei->io_failure_tree.track_uptodate = 1; mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -- cgit v1.2.3 From cfed81a04eb555f5606d1b6a54bdbabab0ee1ac3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 3 Mar 2012 07:40:03 -0500 Subject: Btrfs: add the ability to cache a pointer into the eb This cuts down on the CPU time used by map_private_extent_buffer Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 80 +++++++++++++++++++++++++++++++++++-------------- fs/btrfs/ctree.h | 13 ++++++++ fs/btrfs/struct-funcs.c | 53 +++++++++++++++++++++++++++----- 3 files changed, 116 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 74c03fb0ca1..8af24da983b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2333,6 +2333,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2344,6 +2345,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else @@ -2421,8 +2424,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } left_nritems -= push_items; @@ -2553,6 +2556,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int wret; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2613,9 +2619,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2645,8 +2652,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); @@ -2767,6 +2775,9 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, int ret = 0; int wret; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2788,8 +2799,9 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); @@ -3284,6 +3296,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3310,8 +3325,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3381,6 +3397,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3410,8 +3429,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3454,6 +3474,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3519,8 +3542,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3547,9 +3571,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3588,6 +3613,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3619,8 +3647,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3639,9 +3668,10 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); @@ -3814,6 +3844,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3835,8 +3868,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 85ab1c5844a..c2e17cd299b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1546,6 +1546,17 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +static inline void btrfs_init_map_token (struct btrfs_map_token *token) +{ + memset(token, 0, sizeof(*token)); +} + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1569,6 +1580,8 @@ struct btrfs_ioctl_defrag_range_args { #ifndef BTRFS_SETGET_FUNCS #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #endif diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index bc1f6ad1844..c6ffa581241 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -44,8 +44,9 @@ #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ +void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ +u##bits btrfs_token_##name(struct extent_buffer *eb, \ + type *s, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ u##bits res; \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits leres; \ @@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ res = le##bits##_to_cpu(p->member); \ + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ return res; \ } \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ +void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ @@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb, \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ + unsigned long mem_len = sizeof(((type *)0)->member); \ + if (token && token->kaddr && token->offset <= offset && \ + token->eb == eb && \ + (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ + kaddr = token->kaddr; \ + p = (type *)(kaddr + part_offset - token->offset); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ err = map_private_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ + mem_len, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits val2; \ @@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb, \ } \ p = (type *)(kaddr + part_offset - map_start); \ p->member = cpu_to_le##bits(val); \ -} + if (token) { \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + token->eb = eb; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + btrfs_set_token_##name(eb, s, val, NULL); \ +} \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + return btrfs_token_##name(eb, s, NULL); \ +} \ #include "ctree.h" -- cgit v1.2.3 From a098d8e8eec5a46a47b1bb74390746973d913a9c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 21 Mar 2012 12:09:56 -0400 Subject: Btrfs: loop waiting on writeback lock_extent_buffer_for_io needs to loop around and make sure the writeback bits are not set. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c1b898d590d..b71cc4547d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3010,12 +3010,12 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, flush_write_bio(epd); flush = 1; } - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { - printk(KERN_ERR "Um, ok?\n"); + while (1) { + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); + if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) + break; btrfs_tree_unlock(eb); - return 0; } } -- cgit v1.2.3 From f7c79f30cb2d3883488e70cafc9e3a7edd4b9fdb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 19 Mar 2012 15:54:38 -0400 Subject: Btrfs: adjust the write_lock_level as we unlock btrfs_search_slot sometimes needs write locks on high levels of the tree. It remembers the highest level that needs a write lock and will use that for all future searches through the tree in a given call. But, very often we'll just cow the top level or the level below and we won't really need write locks on the root again after that. This patch changes things to adjust the write lock requirement as it unlocks levels. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8af24da983b..270655da11d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1395,7 +1395,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1427,6 +1428,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1650,6 +1656,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1677,6 +1684,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; + min_write_lock_level = write_lock_level; + again: /* * we try very hard to do read locks on the root @@ -1808,7 +1817,8 @@ cow_done: goto again; } - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1870,7 +1880,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -4108,7 +4119,7 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -4119,7 +4130,7 @@ find_next_key: path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); btrfs_clear_path_blocking(path, NULL, 0); } out: @@ -4355,7 +4366,7 @@ again: } ret = 0; done: - unlock_up(path, 0, 1); + unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; if (!old_spinning) btrfs_set_path_blocking(path); -- cgit v1.2.3 From f3f266ab1bfe4770375d24fa8e72a03278e9450a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 23 Mar 2012 10:22:46 -0400 Subject: Btrfs: don't use threaded IO completion helpers for metadata writes The metadata write IO completion code is now simple enough that we don't need the threaded helpers anymore. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c54aec87e89..53c5ea70279 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -840,15 +840,15 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, { int ret; - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - BUG_ON(ret); - if (!(rw & REQ_WRITE)) { + /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } -- cgit v1.2.3 From ea466794084f55d8fcc100711cf17923bf57e962 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Mar 2012 21:57:36 -0400 Subject: Btrfs: deal with read errors on extent buffers differently Since we need to read and write extent buffers in their entirety we can't use the normal bio_readpage_error stuff since it only works on a per page basis. So instead make it so that if we see an io error in endio we just mark the eb as having an IO error and then in btree_read_extent_buffer_pages we will manually try other mirrors and then overwrite the bad mirror if we find a good copy. This works with larger than page size blocks. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 43 +++++++++++++++++++++++++++++-------------- fs/btrfs/extent_io.c | 42 ++++++++++++++++++++++++++++++++---------- fs/btrfs/extent_io.h | 8 +++++--- 3 files changed, 66 insertions(+), 27 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 53c5ea70279..6107b695841 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -360,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, u64 start, u64 parent_transid) { struct extent_io_tree *io_tree; + int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; + int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; @@ -371,7 +373,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) - return ret; + break; /* * This buffer's crc is fine, but its contents are corrupted, so @@ -379,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * any less wrong. */ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - return ret; + break; + + if (!failed_mirror) { + failed = 1; + printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); + failed_mirror = eb->failed_mirror; + } num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) - return ret; + break; mirror_num++; + if (mirror_num == failed_mirror) + mirror_num++; + if (mirror_num > num_copies) - return ret; + break; } - return -EIO; + + if (failed && !ret) + repair_eb_io_failure(root, eb, failed_mirror); + + return ret; } /* @@ -575,6 +590,11 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (!reads_done) goto err; + if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " @@ -626,21 +646,16 @@ out: return ret; } -static int btree_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - int mirror_num, struct extent_state *state) +static int btree_io_failed_hook(struct page *page, int failed_mirror) { struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return -EIO; - - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = failed_mirror; + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); - } return -EIO; /* we fixed nothing */ } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b71cc4547d4..49a368593a1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1915,6 +1915,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return 0; } +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + u64 start = eb->start; + unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); + int ret; + + for (i = 0; i < num_pages; i++) { + struct page *p = extent_buffer_page(eb, i); + ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + start, p, mirror_num); + if (ret) + break; + start += PAGE_CACHE_SIZE; + } + + return ret; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2261,6 +2281,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; + int failed_mirror; int ret; if (err) @@ -2307,9 +2328,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else clean_io_failure(start, page); } - if (!uptodate) { - int failed_mirror; + + if (!uptodate) failed_mirror = (int)(unsigned long)bio->bi_bdev; + + if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + if (!ret && !err && + test_bit(BIO_UPTODATE, &bio->bi_flags)) + uptodate = 1; + } else if (!uptodate) { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2323,7 +2351,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, page, start, end, failed_mirror, NULL); if (ret == 0) { -error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2331,13 +2358,6 @@ error_handled: uncache_state(&cached); continue; } - if (tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - if (ret == 0) - goto error_handled; - } } if (uptodate && tree->track_uptodate) { @@ -4396,6 +4416,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } + clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + eb->failed_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 489d7945154..38c1af7092f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -58,6 +58,7 @@ #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; +struct btrfs_root; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -73,9 +74,7 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state); + int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, struct extent_state *state); @@ -136,6 +135,7 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; + int failed_mirror; struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; @@ -327,4 +327,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, + int mirror_num); #endif -- cgit v1.2.3 From e565d4b962948e70f37f417603caf131a773621b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 23 Mar 2012 17:14:20 +0100 Subject: Btrfs: actually call btrfs_init_lockdep btrfs_init_lockdep only makes our lockdep class names look prettier, thus it did never hurt we forgot to actually call it. This turns our lockdep identifier strings from lockdep auto-set #[id] into really pretty "btrfs-fs-01" or "btrfs-csum-03". Signed-off-by: Jan Schmidt --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 61717a4eb14..5239003d453 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1404,6 +1404,8 @@ static int __init init_btrfs_fs(void) if (err) goto unregister_ioctl; + btrfs_init_lockdep(); + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; -- cgit v1.2.3 From 103e976616fe9c2a3e40764c979fa1a592274da2 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 23 Mar 2012 17:24:19 +0100 Subject: Btrfs: check return value of btrfs_cow_block() The two helper functions commit_cowonly_roots() and create_pending_snapshot() failed to check the return value from btrfs_cow_block(), which could at least in theory fail with -ENOSPC from btrfs_alloc_free_block(). This commit adds the missing checks. Signed-off-by: Jan Schmidt --- fs/btrfs/transaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04b77e3ceb7..cd220f28363 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -718,7 +718,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, BUG_ON(ret); eb = btrfs_lock_root_node(fs_info->tree_root); - btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); + BUG_ON(ret); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -949,7 +950,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_flags(new_root_item, root_flags); old = btrfs_lock_root_node(root); - btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + BUG_ON(ret); btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); -- cgit v1.2.3 From 7a3ae2f8c8c8432e65467b7fc84d5deab04061a0 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 23 Mar 2012 17:32:28 +0100 Subject: Btrfs: fix regression in scrub path resolving In commit 4692cf58 we introduced new backref walking code for btrfs. This assumes we're searching live roots, which requires a transaction context. While scrubbing, however, we must not join a transaction because this could deadlock with the commit path. Additionally, what scrub really wants to do is resolving a logical address in the commit root it's currently checking. This patch adds support for logical to path resolving on commit roots and makes scrub use that. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 115 ++++++++++++++++++++++++++++++----------------------- fs/btrfs/backref.h | 5 ++- fs/btrfs/ioctl.c | 4 +- fs/btrfs/scrub.c | 4 +- 4 files changed, 73 insertions(+), 55 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0436c12da8c..56136d9046f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -116,6 +116,7 @@ add_parent: * to a logical address */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + int search_commit_root, struct __prelim_ref *ref, struct ulist *parents) { @@ -131,6 +132,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->search_commit_root = !!search_commit_root; root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -188,6 +190,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + int search_commit_root, struct list_head *head) { int err; @@ -212,7 +215,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; - err = __resolve_indirect_ref(fs_info, ref, parents); + err = __resolve_indirect_ref(fs_info, search_commit_root, + ref, parents); if (err) { if (ret == 0) ret = err; @@ -586,6 +590,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; + int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; @@ -600,6 +605,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->search_commit_root = !!search_commit_root; /* * grab both a lock on the path and a lock on the delayed ref head. @@ -614,35 +620,39 @@ again: goto out; BUG_ON(ret == 0); - /* - * look if there are updates for this ref queued and lock the head - */ - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) { - spin_unlock(&delayed_refs->lock); - goto out; + if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + /* + * look if there are updates for this ref queued and lock the + * head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, + &prefs_delayed); + if (ret) { + spin_unlock(&delayed_refs->lock); + goto out; + } } + spin_unlock(&delayed_refs->lock); } - spin_unlock(&delayed_refs->lock); if (path->slots[0]) { struct extent_buffer *leaf; @@ -679,7 +689,7 @@ again: if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, &prefs); + ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); if (ret) goto out; @@ -1074,8 +1084,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 logical, +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, u64 orig_extent_item_objectid, u64 extent_item_pos, u64 root, iterate_extent_inodes_t *iterate, void *ctx) @@ -1143,35 +1152,38 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * calls iterate() for every inode that references the extent identified by * the given parameters. * when the iterator function returns a non-zero value, iteration stops. - * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 extent_item_objectid, u64 extent_item_pos, + int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx) { int ret; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); struct btrfs_trans_handle *trans; - struct ulist *refs; - struct ulist *roots; + struct ulist *refs = NULL; + struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; struct seq_list seq_elem; - struct btrfs_delayed_ref_root *delayed_refs; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); + if (search_commit_root) { + trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; + } else { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); + } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, extent_item_pos, seq_elem.seq, @@ -1188,7 +1200,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, while (!ret && (root_node = ulist_next(roots, root_node))) { pr_debug("root %llu references leaf %llu\n", root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, path, ref_node->val, + ret = iterate_leaf_refs(fs_info, ref_node->val, extent_item_objectid, extent_item_pos, root_node->val, iterate, ctx); @@ -1198,8 +1210,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ulist_free(refs); ulist_free(roots); out: - btrfs_put_delayed_seq(delayed_refs, &seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); + if (!search_commit_root) { + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); + } + return ret; } @@ -1210,6 +1225,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int ret; u64 extent_item_pos; struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; ret = extent_from_logical(fs_info, logical, path, &found_key); @@ -1220,8 +1236,9 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; extent_item_pos = logical - found_key.objectid; - ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, iterate, ctx); + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, search_commit_root, + iterate, ctx); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d00dfa9ca93..57ea2e959e4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -22,6 +22,8 @@ #include "ioctl.h" #include "ulist.h" +#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -44,9 +46,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 extent_item_objectid, - u64 extent_offset, + u64 extent_offset, int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 05446f77f99..013c6371e3e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3067,8 +3067,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; extent_item_pos = loi->logical - key.objectid; - ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_item_pos, build_ino_list, + ret = iterate_extent_inodes(root->fs_info, key.objectid, + extent_item_pos, 0, build_ino_list, inodes); if (ret < 0) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index abc0fbffa51..b9b84cdfc35 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -352,8 +352,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } while (ret != 1); } else { swarn.path = path; - iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_item_pos, + iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, 1, scrub_print_warning_inode, &swarn); } -- cgit v1.2.3 From e3176ca2769e420f64eba4b093bbddea6d7a89c3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:16 +0300 Subject: Btrfs: stop silently switching single chunks to raid0 on balance This has been causing a lot of confusion for quite a while now and a lot of users were surprised by this (some of them were even stuck in a ENOSPC situation which they couldn't easily get out of). The addition of restriper gives users a clear choice between raid0 and drive concat setup so there's absolutely no excuse for us to keep doing this. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1b831ac4c07..4269777f185 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6941,7 +6941,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; - return flags; } else { /* they already had raid on here, just return */ if (flags & stripped) @@ -6954,9 +6953,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DUP) return stripped | BTRFS_BLOCK_GROUP_RAID1; - /* turn single device chunks into raid0 */ - return stripped | BTRFS_BLOCK_GROUP_RAID0; + /* this is drive concat, leave it alone */ } + return flags; } -- cgit v1.2.3 From 899c81eac890bcfa5f3636f4c43f68e8393ac1f8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:16 +0300 Subject: Btrfs: add wrappers for working with alloc profiles Add functions to abstract the conversion between chunk and extended allocation profile formats and switch everybody to use them. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 15 +++++++++++++++ fs/btrfs/extent-tree.c | 25 +++++++------------------ fs/btrfs/volumes.c | 20 ++++++++------------ 3 files changed, 30 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c2e17cd299b..aba7832a228 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -849,6 +849,21 @@ struct btrfs_csum_item { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4269777f185..9f16fdb463c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3098,11 +3098,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; @@ -3181,9 +3178,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) } out: - /* extended -> chunk profile */ - flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return flags; + return extended_to_chunk(flags); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) @@ -6914,11 +6909,8 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; } - if (tgt) { - /* extended -> chunk profile */ - tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - return tgt; - } + if (tgt) + return extended_to_chunk(tgt); } /* @@ -7597,11 +7589,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; - - /* chunk -> extended profile */ - if (extra_flags == 0) - extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 58aad63e1ad..4b263a2009b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2250,15 +2250,13 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_profile, +static int chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - if (bargs->profiles & chunk_profile) + if (bargs->profiles & chunk_type) return 0; return 1; @@ -2365,18 +2363,16 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } -static int chunk_soft_convert_filter(u64 chunk_profile, +static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return 0; - chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; - - if (chunk_profile == 0) - chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + chunk_type = chunk_to_extended(chunk_type) & + BTRFS_EXTENDED_PROFILE_MASK; - if (bargs->target & chunk_profile) + if (bargs->target == chunk_type) return 1; return 0; -- cgit v1.2.3 From e8920a640be5d4ebe3fee0670639a81d4ffc904c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: make profile_is_valid() check more strict "0" is a valid value for an on-disk chunk profile, but it is not a valid extended profile. (We have a separate bit for single chunks in extended case) Also rename it to alloc_profile_is_valid() for clarity. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 21 +++++++++++++-------- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 6 +++--- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aba7832a228..f057e92df39 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2735,22 +2735,27 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info); } /** - * profile_is_valid - tests whether a given profile is valid and reduced + * alloc_profile_is_valid - see if a given profile is valid and reduced * @flags: profile to validate * @extended: if true @flags is treated as an extended profile */ -static inline int profile_is_valid(u64 flags, int extended) +static inline int alloc_profile_is_valid(u64 flags, int extended) { - u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - if (extended) - mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (flags & mask) + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) return 0; - /* true if zero or exactly one bit set */ - return (flags & (~flags + 1)) == flags; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + /* true if exactly one bit set */ + return (flags & (flags - 1)) == 0; } /* root-item.c */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f16fdb463c..8c5bd8fa824 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3400,7 +3400,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!profile_is_valid(flags, 0)); + BUG_ON(!alloc_profile_is_valid(flags, 0)); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4b263a2009b..e4ef0f2fdb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2669,7 +2669,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10); - if (!profile_is_valid(bctl->data.target, 1) || + if (!alloc_profile_is_valid(bctl->data.target, 1) || bctl->data.target & ~allowed) { printk(KERN_ERR "btrfs: unable to start balance with target " "data profile %llu\n", @@ -2677,7 +2677,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, ret = -EINVAL; goto out; } - if (!profile_is_valid(bctl->meta.target, 1) || + if (!alloc_profile_is_valid(bctl->meta.target, 1) || bctl->meta.target & ~allowed) { printk(KERN_ERR "btrfs: unable to start balance with target " "metadata profile %llu\n", @@ -2685,7 +2685,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, ret = -EINVAL; goto out; } - if (!profile_is_valid(bctl->sys.target, 1) || + if (!alloc_profile_is_valid(bctl->sys.target, 1) || bctl->sys.target & ~allowed) { printk(KERN_ERR "btrfs: unable to start balance with target " "system profile %llu\n", -- cgit v1.2.3 From 0c460c0d70e10463e44bdf1d406e9c5ec03b1af6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: move alloc_profile_is_valid() to volumes.c Header file is not a good place to define functions. This also moves a call to alloc_profile_is_valid() down the stack and removes a redundant check from __btrfs_alloc_chunk() - alloc_profile_is_valid() takes it into account. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 23 ----------------------- fs/btrfs/extent-tree.c | 2 -- fs/btrfs/volumes.c | 30 +++++++++++++++++++++++++----- 3 files changed, 25 insertions(+), 30 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f057e92df39..a56e1e00105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2734,29 +2734,6 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static inline int alloc_profile_is_valid(u64 flags, int extended) -{ - u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : - BTRFS_BLOCK_GROUP_PROFILE_MASK); - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - - /* 1) check that all other bits are zeroed */ - if (flags & ~mask) - return 0; - - /* 2) see if profile is reduced */ - if (flags == 0) - return !extended; /* "0" is valid for usual profiles */ - - /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; -} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8c5bd8fa824..304710cae65 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3400,8 +3400,6 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - BUG_ON(!alloc_profile_is_valid(flags, 0)); - space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e4ef0f2fdb7..def9e25f87f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2598,6 +2598,30 @@ error: return ret; } +/** + * alloc_profile_is_valid - see if a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static int alloc_profile_is_valid(u64 flags, int extended) +{ + u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : + BTRFS_BLOCK_GROUP_PROFILE_MASK); + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + + /* 1) check that all other bits are zeroed */ + if (flags & ~mask) + return 0; + + /* 2) see if profile is reduced */ + if (flags == 0) + return !extended; /* "0" is valid for usual profiles */ + + /* true if exactly one bit set */ + return (flags & (flags - 1)) == 0; +} + static inline int balance_need_close(struct btrfs_fs_info *fs_info) { /* cancel requested || normal exit path */ @@ -3124,11 +3148,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int i; int j; - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } + BUG_ON(!alloc_profile_is_valid(type, 0)); if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; -- cgit v1.2.3 From fc67c450837ec034174060a25889a55eed741a1d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: add get_restripe_target() helper Add get_restripe_target() helper and switch everybody to use it. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 94 +++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 44 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 304710cae65..faf52e03031 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3109,6 +3109,35 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) fs_info->avail_system_alloc_bits |= extra_flags; } +/* + * returns target flags in extended format or 0 if restripe for this + * chunk_type is not in progress + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) && + !spin_is_locked(&fs_info->balance_lock)); + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + /* * @flags: available profiles in extended format (see ctree.h) * @@ -3125,31 +3154,19 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) */ u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + u64 target; - /* pick restriper's target profile if it's available */ + /* + * see if restripe for this chunk_type is in progress, if so + * try to reduce to the target profile + */ spin_lock(&root->fs_info->balance_lock); - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && - (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->data.target)) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && - (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->sys.target)) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && - (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (flags & bctl->meta.target)) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - if (tgt) { + target = get_restripe_target(root->fs_info, flags); + if (target) { + /* pick target profile only if it's already available */ + if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { spin_unlock(&root->fs_info->balance_lock); - flags = tgt; - goto out; + return extended_to_chunk(target); } } spin_unlock(&root->fs_info->balance_lock); @@ -3177,7 +3194,6 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) flags &= ~BTRFS_BLOCK_GROUP_RAID0; } -out: return extended_to_chunk(flags); } @@ -6888,28 +6904,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - if (root->fs_info->balance_ctl) { - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - u64 tgt = 0; - - /* pick restriper's target profile and return */ - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } + u64 stripped; - if (tgt) - return extended_to_chunk(tgt); - } + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(root->fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); /* * we add in the count of missing devices because we want @@ -6919,6 +6922,9 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; -- cgit v1.2.3 From 7738a53a3a3aa8d82350280ff4bc7df9c3094123 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: add __get_block_group_index() helper Add __get_block_group_index() helper to be able to derive block group index from an arbitary set of flags. Implement get_block_group_index() in terms of it. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index faf52e03031..c44aa968078 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5248,22 +5248,29 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +static int __get_block_group_index(u64 flags) { int index; - if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + + if (flags & BTRFS_BLOCK_GROUP_RAID10) index = 0; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + else if (flags & BTRFS_BLOCK_GROUP_RAID1) index = 1; - else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + else if (flags & BTRFS_BLOCK_GROUP_DUP) index = 2; - else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + else if (flags & BTRFS_BLOCK_GROUP_RAID0) index = 3; else index = 4; + return index; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + return __get_block_group_index(cache->flags); +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, -- cgit v1.2.3 From 4a5e98f5d61f698452e564e0cde34c16a6b65752 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: improve the logic in btrfs_can_relocate() Currently if we don't have enough space allocated we go ahead and loop though devices in the hopes of finding enough space for a chunk of the *same* type as the one we are trying to relocate. The problem with that is that if we are trying to restripe the chunk its target type can be more relaxed than the current one (eg require less devices or less space). So, when restriping, run checks against the target profile instead of the current one. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c44aa968078..9454045f091 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7136,6 +7136,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; + u64 target; int index; int full = 0; int ret = 0; @@ -7176,13 +7177,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) /* * ok we don't have enough space, but maybe we have free space on our * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. However, if we - * were marked as full, then we know there aren't enough chunks, and we - * can just return. + * alloc devices and guess if we have enough space. if this block + * group is going to be restriped, run checks against the target + * profile instead of the current one. */ ret = -1; - if (full) - goto out; /* * index: @@ -7192,7 +7191,20 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * 3: raid0 * 4: single */ - index = get_block_group_index(block_group); + target = get_restripe_target(root->fs_info, block_group->flags); + if (target) { + index = __get_block_group_index(extended_to_chunk(target)); + } else { + /* + * this is just a balance, so if we were marked as full + * we know there is no space for a new chunk + */ + if (full) + goto out; + + index = get_block_group_index(block_group); + } + if (index == 0) { dev_min = 4; /* Divide by 2 */ -- cgit v1.2.3 From 6728b198deb02c187b5e5a99eb7d1cc9c8bc65e9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: validate target profiles only if we are going to use them Do not run sanity checks on all target profiles unless they all will be used. This came up because alloc_profile_is_valid() is now more strict than it used to be. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index def9e25f87f..28addea5b0f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2676,14 +2676,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } - /* - * Profile changing sanity checks. Skip them if a simple - * balance is requested. - */ - if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & - BTRFS_BALANCE_ARGS_CONVERT)) - goto do_balance; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (fs_info->fs_devices->num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; @@ -2693,24 +2685,27 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10); - if (!alloc_profile_is_valid(bctl->data.target, 1) || - bctl->data.target & ~allowed) { + if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->data.target, 1) || + (bctl->data.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "data profile %llu\n", (unsigned long long)bctl->data.target); ret = -EINVAL; goto out; } - if (!alloc_profile_is_valid(bctl->meta.target, 1) || - bctl->meta.target & ~allowed) { + if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->meta.target, 1) || + (bctl->meta.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "metadata profile %llu\n", (unsigned long long)bctl->meta.target); ret = -EINVAL; goto out; } - if (!alloc_profile_is_valid(bctl->sys.target, 1) || - bctl->sys.target & ~allowed) { + if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl->sys.target, 1) || + (bctl->sys.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "system profile %llu\n", (unsigned long long)bctl->sys.target); @@ -2718,7 +2713,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { printk(KERN_ERR "btrfs: dup for data is not allowed\n"); ret = -EINVAL; goto out; @@ -2744,7 +2740,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } -do_balance: ret = insert_balance_item(fs_info->tree_root, bctl); if (ret && ret != -EEXIST) goto out; -- cgit v1.2.3 From e4837f8f3b5d08b8c708533a71439bfb40ede467 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:17 +0300 Subject: Btrfs: allow dup for data chunks in mixed mode Generally we don't allow dup for data, but mixed chunks are special and people seem to think this has its use cases. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 28addea5b0f..bcc0acda869 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2650,6 +2650,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, { struct btrfs_fs_info *fs_info = bctl->fs_info; u64 allowed; + int mixed = 0; int ret; if (btrfs_fs_closing(fs_info) || @@ -2659,13 +2660,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + /* * In case of mixed groups both data and meta should be picked, * and identical options should be given for both of them. */ - allowed = btrfs_super_incompat_flags(fs_info->super_copy); - if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; + if (mixed && (bctl->flags & allowed)) { if (!(bctl->flags & BTRFS_BALANCE_DATA) || !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { @@ -2713,7 +2717,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + /* allow dup'ed data chunks only in mixed mode */ + if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { printk(KERN_ERR "btrfs: dup for data is not allowed\n"); ret = -EINVAL; -- cgit v1.2.3 From 5eb56d2520fe16f00756ccdf8eebc277398e0f44 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:18 +0300 Subject: Btrfs: fix memory leak in resolver code init_ipath() allocates btrfs_data_container which is never freed. Free it in free_ipath() and nuke the comment for init_data_container() - we can safely free it with kfree(). Signed-off-by: Ilya Dryomov --- fs/btrfs/backref.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0436c12da8c..4c79547f4a0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1342,12 +1342,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) inode_to_path, ipath); } -/* - * allocates space to return multiple file system paths for an inode. - * total_bytes to allocate are passed, note that space usable for actual path - * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. - */ struct btrfs_data_container *init_data_container(u32 total_bytes) { struct btrfs_data_container *data; @@ -1403,5 +1397,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + kfree(ipath->fspath); kfree(ipath); } -- cgit v1.2.3 From 213e64da90d14537cd63f7090d6c4d1fcc75d9f8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 27 Mar 2012 17:09:18 +0300 Subject: Btrfs: fix infinite loop in btrfs_shrink_device() If relocate of block group 0 fails with ENOSPC we end up infinitely looping because key.offset -= 1 statement in that case brings us back to where we started. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcc0acda869..be2d4e0e6cf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2987,7 +2987,7 @@ again: key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; - while (1) { + do { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto done; @@ -3029,8 +3029,7 @@ again: goto done; if (ret == -ENOSPC) failed++; - key.offset -= 1; - } + } while (key.offset-- > 0); if (failed && !retried) { failed = 0; -- cgit v1.2.3 From 94598ba8d8ff066115508fb99e593d2de1ca67e1 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Mar 2012 14:21:26 -0400 Subject: Btrfs: introduce common define for max number of mirrors Readahead already has a define for the max number of mirrors. Scrub needs such a define now, the rest of the code will need something like this soon. Therefore the define was added to ctree.h and removed from the readahead code. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/reada.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c2e17cd299b..f7da8a8d13c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAX_MIRRORS 2 + #define BTRFS_MAX_LEVEL 8 #define BTRFS_COMPAT_EXTENT_TREE_V0 diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 22db04550f6..dc5d33146fd 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -54,7 +54,6 @@ * than the 2 started one after another. */ -#define MAX_MIRRORS 2 #define MAX_IN_FLIGHT 6 struct reada_extctl { @@ -71,7 +70,7 @@ struct reada_extent { struct list_head extctl; struct kref refcnt; spinlock_t lock; - struct reada_zone *zones[MAX_MIRRORS]; + struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; struct btrfs_device *scheduled_for; }; @@ -84,7 +83,8 @@ struct reada_zone { spinlock_t lock; int locked; struct btrfs_device *device; - struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl + * self */ int ndevs; struct kref refcnt; }; @@ -365,9 +365,9 @@ again: if (ret || !bbio || length < blocksize) goto error; - if (bbio->num_stripes > MAX_MIRRORS) { + if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", MAX_MIRRORS); + "supported", BTRFS_MAX_MIRRORS); goto error; } -- cgit v1.2.3 From 1623edebee317855c6a854366c01d1630cc537c9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Mar 2012 14:21:26 -0400 Subject: Btrfs: minor cleanup in scrub Just a minor cleanup commit in preparation for the big block changes. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 61 +++++++++++++++++++++++--------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index abc0fbffa51..e68bab4ffcd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -36,28 +36,11 @@ * Future enhancements: * - In case an unrepairable extent is encountered, track which files are * affected and report them - * - In case of a read error on files with nodatasum, map the file and read - * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space */ -struct scrub_bio; -struct scrub_page; struct scrub_dev; -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_checksum(struct btrfs_work *work); -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer); -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer); -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); -static int scrub_fixup_check(struct scrub_bio *sbio, int ix); -static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page); -static void scrub_fixup(struct scrub_bio *sbio, int ix); #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ @@ -124,6 +107,21 @@ struct scrub_warning { int scratch_bufsize; }; +static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_checksum(struct btrfs_work *work); +static int scrub_checksum_data(struct scrub_dev *sdev, + struct scrub_page *spag, void *buffer); +static int scrub_checksum_tree_block(struct scrub_dev *sdev, + struct scrub_page *spag, u64 logical, + void *buffer); +static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); +static int scrub_fixup_check(struct scrub_bio *sbio, int ix); +static void scrub_fixup_end_io(struct bio *bio, int err); +static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, + struct page *page); +static void scrub_fixup(struct scrub_bio *sbio, int ix); + + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -342,7 +340,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING "%s at logical %llu on dev %s, " + printk(KERN_WARNING + "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " "%llu\n", errstr, swarn.logical, dev->name, (unsigned long long)swarn.sector, @@ -948,12 +947,12 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } -static int scrub_submit(struct scrub_dev *sdev) +static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; if (sdev->curr == -1) - return 0; + return; sbio = sdev->bios[sdev->curr]; sbio->err = 0; @@ -961,8 +960,6 @@ static int scrub_submit(struct scrub_dev *sdev) atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); - - return 0; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -1008,9 +1005,7 @@ again: sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - ret = scrub_submit(sdev); - if (ret) - return ret; + scrub_submit(sdev); goto again; } sbio->spag[sbio->count].flags = flags; @@ -1025,9 +1020,7 @@ again: ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); if (!ret) { __free_page(page); - ret = scrub_submit(sdev); - if (ret) - return ret; + scrub_submit(sdev); goto again; } @@ -1036,13 +1029,8 @@ again: memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); } ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) { - int ret; - - ret = scrub_submit(sdev); - if (ret) - return ret; - } + if (sbio->count == SCRUB_PAGES_PER_BIO || force) + scrub_submit(sdev); return 0; } @@ -1520,7 +1508,7 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, @@ -1741,6 +1729,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) return 0; } + int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) { struct btrfs_fs_info *fs_info = root->fs_info; -- cgit v1.2.3 From b5d67f64f9bc656970dacba245410f0faedad18e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 27 Mar 2012 14:21:27 -0400 Subject: Btrfs: change scrub to support big blocks Scrub used to be coded for nodesize == leafsize == sectorsize == PAGE_SIZE. This is now changed to support sizes for nodesize and leafsize which are N * PAGE_SIZE. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 1353 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 1013 insertions(+), 340 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e68bab4ffcd..5221e072bb6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -40,16 +40,26 @@ * - add a mode to also read unallocated space */ +struct scrub_block; struct scrub_dev; #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ +#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_page { + struct scrub_block *sblock; + struct page *page; + struct block_device *bdev; u64 flags; /* extent flags */ u64 generation; - int mirror_num; - int have_csum; + u64 logical; + u64 physical; + struct { + unsigned int mirror_num:8; + unsigned int have_csum:1; + unsigned int io_error:1; + }; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -60,12 +70,25 @@ struct scrub_bio { int err; u64 logical; u64 physical; - struct scrub_page spag[SCRUB_PAGES_PER_BIO]; - u64 count; + struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; + int page_count; int next_free; struct btrfs_work work; }; +struct scrub_block { + struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; + int page_count; + atomic_t outstanding_pages; + atomic_t ref_count; /* free mem on transition to zero */ + struct scrub_dev *sdev; + struct { + unsigned int header_error:1; + unsigned int checksum_error:1; + unsigned int no_io_error_seen:1; + }; +}; + struct scrub_dev { struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; struct btrfs_device *dev; @@ -79,6 +102,10 @@ struct scrub_dev { struct list_head csum_list; atomic_t cancel_req; int readonly; + int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ + u32 sectorsize; + u32 nodesize; + u32 leafsize; /* * statistics */ @@ -107,19 +134,41 @@ struct scrub_warning { int scratch_bufsize; }; + +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblock); +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size); +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size); +static void scrub_complete_bio_end_io(struct bio *bio, int err); +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write); +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write); +static int scrub_checksum_data(struct scrub_block *sblock); +static int scrub_checksum_tree_block(struct scrub_block *sblock); +static int scrub_checksum_super(struct scrub_block *sblock); +static void scrub_block_get(struct scrub_block *sblock); +static void scrub_block_put(struct scrub_block *sblock); +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage); +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force); static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_checksum(struct btrfs_work *work); -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer); -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer); -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); -static int scrub_fixup_check(struct scrub_bio *sbio, int ix); -static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page); -static void scrub_fixup(struct scrub_bio *sbio, int ix); +static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_block_complete(struct scrub_block *sblock); static void scrub_free_csums(struct scrub_dev *sdev) @@ -133,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } -static void scrub_free_bio(struct bio *bio) -{ - int i; - struct page *last_page = NULL; - - if (!bio) - return; - - for (i = 0; i < bio->bi_vcnt; ++i) { - if (bio->bi_io_vec[i].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[i].bv_page; - __free_page(last_page); - } - bio_put(bio); -} - static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; @@ -157,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) if (!sdev) return; + /* this can happen when scrub is cancelled */ + if (sdev->curr != -1) { + struct scrub_bio *sbio = sdev->bios[sdev->curr]; + + for (i = 0; i < sbio->page_count; i++) { + BUG_ON(!sbio->pagev[i]); + BUG_ON(!sbio->pagev[i]->page); + scrub_block_put(sbio->pagev[i]->sblock); + } + bio_put(sbio->bio); + } + for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; if (!sbio) break; - - scrub_free_bio(sbio->bio); kfree(sbio); } @@ -177,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) struct scrub_dev *sdev; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + int pages_per_bio; + pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, + bio_get_nr_vecs(dev->bdev)); sdev = kzalloc(sizeof(*sdev), GFP_NOFS); if (!sdev) goto nomem; sdev->dev = dev; + sdev->pages_per_bio = pages_per_bio; + sdev->curr = -1; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio; @@ -192,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sbio->index = i; sbio->sdev = sdev; - sbio->count = 0; - sbio->work.func = scrub_checksum; + sbio->page_count = 0; + sbio->work.func = scrub_bio_end_io_worker; if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -201,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->bios[i]->next_free = -1; } sdev->first_free = 0; - sdev->curr = -1; + sdev->nodesize = dev->dev_root->nodesize; + sdev->leafsize = dev->dev_root->leafsize; + sdev->sectorsize = dev->dev_root->sectorsize; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); @@ -292,10 +341,9 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, - int ix) +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) { - struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_device *dev = sblock->sdev->dev; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; struct btrfs_path *path; struct btrfs_key found_key; @@ -314,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; - swarn.logical = sbio->logical + ix * PAGE_SIZE; + BUG_ON(sblock->page_count < 1); + swarn.sector = (sblock->pagev[0].physical) >> 9; + swarn.logical = sblock->pagev[0].logical; swarn.errstr = errstr; swarn.dev = dev; swarn.msg_bufsize = bufsize; @@ -530,9 +579,9 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR "btrfs: unable to fixup " - "(nodatasum) error at logical %llu\n", - fixup->logical); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", + (unsigned long long)fixup->logical, sdev->dev->name); } btrfs_free_path(path); @@ -549,91 +598,168 @@ out: } /* - * scrub_recheck_error gets called when either verification of the page - * failed or the bio failed to read, e.g. with EIO. In the latter case, - * recheck_error gets called for every page in the bio, even though only - * one may be bad + * scrub_handle_errored_block gets called when either verification of the + * pages failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all pages in the bio, even though only one + * may be bad. + * The goal of this function is to repair the errored block by using the + * contents of one of the mirrors. */ -static int scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { - struct scrub_dev *sdev = sbio->sdev; - u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + struct scrub_dev *sdev = sblock_to_check->sdev; + struct btrfs_fs_info *fs_info; + u64 length; + u64 logical; + u64 generation; + unsigned int failed_mirror_index; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + struct scrub_block *sblock_bad; + int ret; + int mirror_index; + int page_num; + int success; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + DEFAULT_RATELIMIT_BURST); + + BUG_ON(sblock_to_check->page_count < 1); + fs_info = sdev->dev->dev_root->fs_info; + length = sblock_to_check->page_count * PAGE_SIZE; + logical = sblock_to_check->pagev[0].logical; + generation = sblock_to_check->pagev[0].generation; + BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); + failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; + is_metadata = !(sblock_to_check->pagev[0].flags & + BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock_to_check->pagev[0].have_csum; + csum = sblock_to_check->pagev[0].csum; - if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, - sbio->bio->bi_io_vec[ix].bv_page) == 0) { - if (scrub_fixup_check(sbio, ix) == 0) - return 0; - } - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sbio, ix); - } else { - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sbio, ix); + /* + * read all mirrors one after the other. This includes to + * re-read the extent or metadata block that failed (that was + * the cause that this fixup code is called) another time, + * page by page this time in order to know which pages + * caused I/O errors and which ones are good (for all mirrors). + * It is the goal to handle the situation when more than one + * mirror contains I/O errors, but the errors do not + * overlap, i.e. the data can be repaired by selecting the + * pages from those mirrors without I/O error on the + * particular pages. One example (with blocks >= 2 * PAGE_SIZE) + * would be that mirror #1 has an I/O error on the first page, + * the second page is good, and mirror #2 has an I/O error on + * the second page, but the first page is good. + * Then the first page of the first mirror can be repaired by + * taking the first page of the second mirror, and the + * second page of the second mirror can be repaired by + * copying the contents of the 2nd page of the 1st mirror. + * One more note: if the pages of one mirror contain I/O + * errors, the checksum cannot be verified. In order to get + * the best data for repairing, the first attempt is to find + * a mirror without I/O errors and with a validated checksum. + * Only if this is not possible, the pages are picked from + * mirrors with I/O errors without considering the checksum. + * If the latter is the case, at the end, the checksum of the + * repaired area is verified in order to correctly maintain + * the statistics. + */ + + sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * + sizeof(*sblocks_for_recheck), + GFP_NOFS); + if (!sblocks_for_recheck) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; } - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); + /* setup the context, map the logical blocks and alloc the pages */ + ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, + logical, sblocks_for_recheck); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } + BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); + sblock_bad = sblocks_for_recheck + failed_mirror_index; - scrub_fixup(sbio, ix); - return 1; -} + /* build and submit the bios for the failed mirror, check checksums */ + ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, + csum, generation, sdev->csum_size); + if (ret) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + goto out; + } -static int scrub_fixup_check(struct scrub_bio *sbio, int ix) -{ - int ret = 1; - struct page *page; - void *buffer; - u64 flags = sbio->spag[ix].flags; + if (!sblock_bad->header_error && !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) { + /* + * the error disappeared after reading page by page, or + * the area was part of a huge bio and other parts of the + * bio caused I/O errors, or the block layer merged several + * read requests into one and the error is caused by a + * different bio (usually one of the two latter cases is + * the cause) + */ + spin_lock(&sdev->stat_lock); + sdev->stat.unverified_errors++; + spin_unlock(&sdev->stat_lock); - page = sbio->bio->bi_io_vec[ix].bv_page; - buffer = kmap_atomic(page, KM_USER0); - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sbio->sdev, - sbio->spag + ix, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sbio->sdev, - sbio->spag + ix, - sbio->logical + ix * PAGE_SIZE, - buffer); - } else { - WARN_ON(1); + goto out; } - kunmap_atomic(buffer, KM_USER0); - return ret; -} + if (!sblock_bad->no_io_error_seen) { + spin_lock(&sdev->stat_lock); + sdev->stat.read_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sblock_to_check); + } else if (sblock_bad->checksum_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.csum_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sblock_to_check); + } else if (sblock_bad->header_error) { + spin_lock(&sdev->stat_lock); + sdev->stat.verify_errors++; + spin_unlock(&sdev->stat_lock); + if (__ratelimit(&_rs)) + scrub_print_warning("checksum/header error", + sblock_to_check); + } -static void scrub_fixup_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} + if (sdev->readonly) + goto did_not_correct_error; + + if (!is_metadata && !have_csum) { + struct scrub_fixup_nodatasum *fixup_nodatasum; -static void scrub_fixup(struct scrub_bio *sbio, int ix) -{ - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_bio *bbio = NULL; - struct scrub_fixup_nodatasum *fixup; - u64 logical = sbio->logical + ix * PAGE_SIZE; - u64 length; - int i; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && - (sbio->spag[ix].have_csum == 0)) { - fixup = kzalloc(sizeof(*fixup), GFP_NOFS); - if (!fixup) - goto uncorrectable; - fixup->sdev = sdev; - fixup->logical = logical; - fixup->root = fs_info->extent_root; - fixup->mirror_num = sbio->spag[ix].mirror_num; + /* + * !is_metadata and !have_csum, this means that the data + * might not be COW'ed, that it might be modified + * concurrently. The general strategy to work on the + * commit root does not help in the case when COW is not + * used. + */ + fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); + if (!fixup_nodatasum) + goto did_not_correct_error; + fixup_nodatasum->sdev = sdev; + fixup_nodatasum->logical = logical; + fixup_nodatasum->root = fs_info->extent_root; + fixup_nodatasum->mirror_num = failed_mirror_index + 1; /* * increment scrubs_running to prevent cancel requests from * completing as long as a fixup worker is running. we must also @@ -648,235 +774,529 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); atomic_inc(&sdev->fixup_cnt); - fixup->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); - return; + fixup_nodatasum->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, + &fixup_nodatasum->work); + goto out; } - length = PAGE_SIZE; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &bbio, 0); - if (ret || !bbio || length < PAGE_SIZE) { - printk(KERN_ERR - "scrub_fixup: btrfs_map_block failed us for %llu\n", - (unsigned long long)logical); - WARN_ON(1); - kfree(bbio); - return; + /* + * now build and submit the bios for the other mirrors, check + * checksums + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (mirror_index == failed_mirror_index) + continue; + + /* build and submit the bios, check checksums */ + ret = scrub_recheck_block(fs_info, + sblocks_for_recheck + mirror_index, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (ret) + goto did_not_correct_error; } - if (bbio->num_stripes == 1) - /* there aren't any replicas */ - goto uncorrectable; + /* + * first try to pick the mirror which is completely without I/O + * errors and also does not have a checksum error. + * If one is found, and if a checksum is present, the full block + * that is known to contain an error is rewritten. Afterwards + * the block is known to be corrected. + * If a mirror is found which is completely correct, and no + * checksum is present, only those pages are rewritten that had + * an I/O error in the block to be repaired, since it cannot be + * determined, which copy of the other pages is better (and it + * could happen otherwise that a correct page would be + * overwritten by a bad one). + */ + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + + if (!sblock_other->header_error && + !sblock_other->checksum_error && + sblock_other->no_io_error_seen) { + int force_write = is_metadata || have_csum; + + ret = scrub_repair_block_from_good_copy(sblock_bad, + sblock_other, + force_write); + if (0 == ret) + goto corrected_error; + } + } /* - * first find a good copy + * in case of I/O errors in the area that is supposed to be + * repaired, continue by picking good copies of those pages. + * Select the good pages from mirrors to rewrite bad pages from + * the area to fix. Afterwards verify the checksum of the block + * that is supposed to be repaired. This verification step is + * only done for the purpose of statistic counting and for the + * final scrub report, whether errors remain. + * A perfect algorithm could make use of the checksum and try + * all possible combinations of pages from the different mirrors + * until the checksum verification succeeds. For example, when + * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * of mirror #2 is readable but the final checksum test fails, + * then the 2nd page of mirror #3 could be tried, whether now + * the final checksum succeedes. But this would be a rare + * exception and is therefore not implemented. At least it is + * avoided that the good copy is overwritten. + * A more useful improvement would be to pick the sectors + * without I/O error based on sector sizes (512 bytes on legacy + * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * mirror could be repaired by taking 512 byte of a different + * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * area are unreadable. */ - for (i = 0; i < bbio->num_stripes; ++i) { - if (i + 1 == sbio->spag[ix].mirror_num) - continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, - bbio->stripes[i].physical >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, this is not a good copy */ + /* can only fix I/O errors from here on */ + if (sblock_bad->no_io_error_seen) + goto did_not_correct_error; + + success = 1; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + + if (!page_bad->io_error) continue; + + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + struct scrub_block *sblock_other = sblocks_for_recheck + + mirror_index; + struct scrub_page *page_other = sblock_other->pagev + + page_num; + + if (!page_other->io_error) { + ret = scrub_repair_page_from_good_copy( + sblock_bad, sblock_other, page_num, 0); + if (0 == ret) { + page_bad->io_error = 0; + break; /* succeeded for this page */ + } + } } - if (scrub_fixup_check(sbio, ix) == 0) - break; + if (page_bad->io_error) { + /* did not find a mirror to copy the page from */ + success = 0; + } } - if (i == bbio->num_stripes) - goto uncorrectable; - if (!sdev->readonly) { - /* - * bi_io_vec[ix].bv_page now contains good data, write it back - */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, - sbio->bio->bi_io_vec[ix].bv_page)) { - /* I/O-error, writeback failed, give up */ - goto uncorrectable; + if (success) { + if (is_metadata || have_csum) { + /* + * need to verify the checksum now that all + * sectors on disk are repaired (the write + * request for data to be repaired is on its way). + * Just be lazy and use scrub_recheck_block() + * which re-reads the data before the checksum + * is verified, but most likely the data comes out + * of the page cache. + */ + ret = scrub_recheck_block(fs_info, sblock_bad, + is_metadata, have_csum, csum, + generation, sdev->csum_size); + if (!ret && !sblock_bad->header_error && + !sblock_bad->checksum_error && + sblock_bad->no_io_error_seen) + goto corrected_error; + else + goto did_not_correct_error; + } else { +corrected_error: + spin_lock(&sdev->stat_lock); + sdev->stat.corrected_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: fixed up error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } + } else { +did_not_correct_error: + spin_lock(&sdev->stat_lock); + sdev->stat.uncorrectable_errors++; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR + "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", + (unsigned long long)logical, sdev->dev->name); } - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); +out: + if (sblocks_for_recheck) { + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; + mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck + + mirror_index; + int page_index; + + for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; + page_index++) + if (sblock->pagev[page_index].page) + __free_page( + sblock->pagev[page_index].page); + } + kfree(sblocks_for_recheck); + } - printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", - (unsigned long long)logical); - return; + return 0; +} -uncorrectable: - kfree(bbio); - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); +static int scrub_setup_recheck_block(struct scrub_dev *sdev, + struct btrfs_mapping_tree *map_tree, + u64 length, u64 logical, + struct scrub_block *sblocks_for_recheck) +{ + int page_index; + int mirror_index; + int ret; + + /* + * note: the three members sdev, ref_count and outstanding_pages + * are not used (and not set) in the blocks that are used for + * the recheck procedure + */ + + page_index = 0; + while (length > 0) { + u64 sublen = min_t(u64, length, PAGE_SIZE); + u64 mapped_length = sublen; + struct btrfs_bio *bbio = NULL; - printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " - "logical %llu\n", (unsigned long long)logical); + /* + * with a length of PAGE_SIZE, each returned stripe + * represents one mirror + */ + ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, + &bbio, 0); + if (ret || !bbio || mapped_length < sublen) { + kfree(bbio); + return -EIO; + } + + BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); + for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + mirror_index++) { + struct scrub_block *sblock; + struct scrub_page *page; + + if (mirror_index >= BTRFS_MAX_MIRRORS) + continue; + + sblock = sblocks_for_recheck + mirror_index; + page = sblock->pagev + page_index; + page->logical = logical; + page->physical = bbio->stripes[mirror_index].physical; + page->bdev = bbio->stripes[mirror_index].dev->bdev; + page->mirror_num = mirror_index + 1; + page->page = alloc_page(GFP_NOFS); + if (!page->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + sblock->page_count++; + } + kfree(bbio); + length -= sublen; + logical += sublen; + page_index++; + } + + return 0; } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, - struct page *page) +/* + * this function will check the on disk data for checksum errors, header + * errors and read I/O errors. If any I/O errors happen, the exact pages + * which are errored are marked as being bad. The goal is to enable scrub + * to take those pages that are not errored from all the mirrors so that + * the pages that are errored in the just handled mirror can be repaired. + */ +static int scrub_recheck_block(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, int is_metadata, + int have_csum, u8 *csum, u64 generation, + u16 csum_size) { - struct bio *bio = NULL; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); + int page_num; - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; - bio->bi_sector = sector; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio->bi_end_io = scrub_fixup_end_io; - bio->bi_private = &complete; - btrfsic_submit_bio(rw, bio); + sblock->no_io_error_seen = 1; + sblock->header_error = 0; + sblock->checksum_error = 0; - /* this will also unplug the queue */ - wait_for_completion(&complete); + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct bio *bio; + int ret; + struct scrub_page *page = sblock->pagev + page_num; + DECLARE_COMPLETION_ONSTACK(complete); + + BUG_ON(!page->page); + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page->bdev; + bio->bi_sector = page->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; + } + btrfsic_submit_bio(READ, bio); - ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); - bio_put(bio); - return ret; + /* this will also unplug the queue */ + wait_for_completion(&complete); + + page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + sblock->no_io_error_seen = 0; + bio_put(bio); + } + + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + csum_size); + + return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock, + int is_metadata, int have_csum, + const u8 *csum, u64 generation, + u16 csum_size) { - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + int page_num; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + struct btrfs_root *root = fs_info->extent_root; + void *mapped_buffer; + + BUG_ON(!sblock->pagev[0].page); + if (is_metadata) { + struct btrfs_header *h; + + mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); + h = (struct btrfs_header *)mapped_buffer; + + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || + generation != le64_to_cpu(h->generation) || + memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE)) + sblock->header_error = 1; + csum = h->csum; + } else { + if (!have_csum) + return; - sbio->err = err; - sbio->bio = bio; + mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); + } - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + for (page_num = 0;;) { + if (page_num == 0 && is_metadata) + crc = btrfs_csum_data(root, + ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, + crc, PAGE_SIZE - BTRFS_CSUM_SIZE); + else + crc = btrfs_csum_data(root, mapped_buffer, crc, + PAGE_SIZE); + + kunmap_atomic(mapped_buffer, KM_USER0); + page_num++; + if (page_num >= sblock->page_count) + break; + BUG_ON(!sblock->pagev[page_num].page); + + mapped_buffer = kmap_atomic(sblock->pagev[page_num].page, + KM_USER0); + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, csum, csum_size)) + sblock->checksum_error = 1; } -static void scrub_checksum(struct btrfs_work *work) +static void scrub_complete_bio_end_io(struct bio *bio, int err) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - struct page *page; - void *buffer; - int i; - u64 flags; - u64 logical; - int ret; + complete((struct completion *)bio->bi_private); +} - if (sbio->err) { - ret = 0; - for (i = 0; i < sbio->count; ++i) - ret |= scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int force_write) +{ + int page_num; + int ret = 0; - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; + for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + int ret_sub; - for (i = 0; i < sbio->count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - goto out; + ret_sub = scrub_repair_page_from_good_copy(sblock_bad, + sblock_good, + page_num, + force_write); + if (ret_sub) + ret = ret_sub; } - for (i = 0; i < sbio->count; ++i) { - page = sbio->bio->bi_io_vec[i].bv_page; - buffer = kmap_atomic(page, KM_USER0); - flags = sbio->spag[i].flags; - logical = sbio->logical + i * PAGE_SIZE; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = scrub_checksum_tree_block(sdev, sbio->spag + i, - logical, buffer); - } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { - BUG_ON(i); - (void)scrub_checksum_super(sbio, buffer); - } else { - WARN_ON(1); - } - kunmap_atomic(buffer, KM_USER0); - if (ret) { - ret = scrub_recheck_error(sbio, i); - if (!ret) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.unverified_errors; - spin_unlock(&sdev->stat_lock); - } + + return ret; +} + +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int page_num, int force_write) +{ + struct scrub_page *page_bad = sblock_bad->pagev + page_num; + struct scrub_page *page_good = sblock_good->pagev + page_num; + + BUG_ON(sblock_bad->pagev[page_num].page == NULL); + BUG_ON(sblock_good->pagev[page_num].page == NULL); + if (force_write || sblock_bad->header_error || + sblock_bad->checksum_error || page_bad->io_error) { + struct bio *bio; + int ret; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = page_bad->bdev; + bio->bi_sector = page_bad->physical >> 9; + bio->bi_end_io = scrub_complete_bio_end_io; + bio->bi_private = &complete; + + ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + if (PAGE_SIZE != ret) { + bio_put(bio); + return -EIO; } + btrfsic_submit_bio(WRITE, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + bio_put(bio); } -out: - scrub_free_bio(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); + return 0; +} + +static void scrub_checksum(struct scrub_block *sblock) +{ + u64 flags; + int ret; + + BUG_ON(sblock->page_count < 1); + flags = sblock->pagev[0].flags; + ret = 0; + if (flags & BTRFS_EXTENT_FLAG_DATA) + ret = scrub_checksum_data(sblock); + else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = scrub_checksum_tree_block(sblock); + else if (flags & BTRFS_EXTENT_FLAG_SUPER) + (void)scrub_checksum_super(sblock); + else + WARN_ON(1); + if (ret) + scrub_handle_errored_block(sblock); } -static int scrub_checksum_data(struct scrub_dev *sdev, - struct scrub_page *spag, void *buffer) +static int scrub_checksum_data(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; u8 csum[BTRFS_CSUM_SIZE]; + u8 *on_disk_csum; + struct page *page; + void *buffer; u32 crc = ~(u32)0; int fail = 0; struct btrfs_root *root = sdev->dev->dev_root; + u64 len; + int index; - if (!spag->have_csum) + BUG_ON(sblock->page_count < 1); + if (!sblock->pagev[0].have_csum) return 0; - crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); + on_disk_csum = sblock->pagev[0].csum; + page = sblock->pagev[0].page; + buffer = kmap_atomic(page, KM_USER0); + + len = sdev->sectorsize; + index = 0; + for (;;) { + u64 l = min_t(u64, len, PAGE_SIZE); + + crc = btrfs_csum_data(root, buffer, crc, l); + kunmap_atomic(buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + buffer = kmap_atomic(page, KM_USER0); + } + btrfs_csum_final(crc, csum); - if (memcmp(csum, spag->csum, sdev->csum_size)) + if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - spin_lock(&sdev->stat_lock); - ++sdev->stat.data_extents_scrubbed; - sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) + if (fail) { + spin_lock(&sdev->stat_lock); ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); + spin_unlock(&sdev->stat_lock); + } return fail; } -static int scrub_checksum_tree_block(struct scrub_dev *sdev, - struct scrub_page *spag, u64 logical, - void *buffer) +static int scrub_checksum_tree_block(struct scrub_block *sblock) { + struct scrub_dev *sdev = sblock->sdev; struct btrfs_header *h; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; int crc_fail = 0; + u64 len; + int index; + + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + h = (struct btrfs_header *)mapped_buffer; + memcpy(on_disk_csum, h->csum, sdev->csum_size); /* * we don't use the getter functions here, as we * a) don't have an extent buffer and * b) the page is already kmapped */ - h = (struct btrfs_header *)buffer; - if (logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) ++fail; - if (spag->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -886,51 +1306,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, sdev->csum_size)) + BUG_ON(sdev->nodesize != sdev->leafsize); + len = sdev->nodesize - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++crc_fail; - spin_lock(&sdev->stat_lock); - ++sdev->stat.tree_extents_scrubbed; - sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); + if (crc_fail || fail) { + spin_lock(&sdev->stat_lock); + if (crc_fail) + ++sdev->stat.csum_errors; + if (fail) + ++sdev->stat.verify_errors; + spin_unlock(&sdev->stat_lock); + } return fail || crc_fail; } -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) +static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; - u64 logical; - struct scrub_dev *sdev = sbio->sdev; + struct scrub_dev *sdev = sblock->sdev; struct btrfs_root *root = sdev->dev->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; - u8 csum[BTRFS_CSUM_SIZE]; + u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; + struct page *page; + void *mapped_buffer; + u64 mapped_size; + void *p; u32 crc = ~(u32)0; int fail = 0; + u64 len; + int index; - s = (struct btrfs_super_block *)buffer; - logical = sbio->logical; + BUG_ON(sblock->page_count < 1); + page = sblock->pagev[0].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + s = (struct btrfs_super_block *)mapped_buffer; + memcpy(on_disk_csum, s->csum, sdev->csum_size); - if (logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) ++fail; - if (sbio->spag[0].generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) ++fail; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) ++fail; - crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, - PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; + mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; + p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + index = 0; + for (;;) { + u64 l = min_t(u64, len, mapped_size); + + crc = btrfs_csum_data(root, p, crc, l); + kunmap_atomic(mapped_buffer, KM_USER0); + len -= l; + if (len == 0) + break; + index++; + BUG_ON(index >= sblock->page_count); + BUG_ON(!sblock->pagev[index].page); + page = sblock->pagev[index].page; + mapped_buffer = kmap_atomic(page, KM_USER0); + mapped_size = PAGE_SIZE; + p = mapped_buffer; + } + + btrfs_csum_final(crc, calculated_csum); + if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++fail; if (fail) { @@ -947,6 +1415,23 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) return fail; } +static void scrub_block_get(struct scrub_block *sblock) +{ + atomic_inc(&sblock->ref_count); +} + +static void scrub_block_put(struct scrub_block *sblock) +{ + if (atomic_dec_and_test(&sblock->ref_count)) { + int i; + + for (i = 0; i < sblock->page_count; i++) + if (sblock->pagev[i].page) + __free_page(sblock->pagev[i].page); + kfree(sblock); + } +} + static void scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; @@ -955,19 +1440,17 @@ static void scrub_submit(struct scrub_dev *sdev) return; sbio = sdev->bios[sdev->curr]; - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); btrfsic_submit_bio(READ, sbio->bio); } -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) +static int scrub_add_page_to_bio(struct scrub_dev *sdev, + struct scrub_page *spage) { + struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; - struct page *page; int ret; again: @@ -980,7 +1463,7 @@ again: if (sdev->curr != -1) { sdev->first_free = sdev->bios[sdev->curr]->next_free; sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->count = 0; + sdev->bios[sdev->curr]->page_count = 0; spin_unlock(&sdev->list_lock); } else { spin_unlock(&sdev->list_lock); @@ -988,53 +1471,200 @@ again: } } sbio = sdev->bios[sdev->curr]; - if (sbio->count == 0) { + if (sbio->page_count == 0) { struct bio *bio; - sbio->physical = physical; - sbio->logical = logical; - bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - return -ENOMEM; + sbio->physical = spage->physical; + sbio->logical = spage->logical; + bio = sbio->bio; + if (!bio) { + bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); + if (!bio) + return -ENOMEM; + sbio->bio = bio; + } bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; + bio->bi_sector = spage->physical >> 9; sbio->err = 0; - sbio->bio = bio; - } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || - sbio->logical + sbio->count * PAGE_SIZE != logical) { + } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + spage->physical || + sbio->logical + sbio->page_count * PAGE_SIZE != + spage->logical) { scrub_submit(sdev); goto again; } - sbio->spag[sbio->count].flags = flags; - sbio->spag[sbio->count].generation = gen; - sbio->spag[sbio->count].have_csum = 0; - sbio->spag[sbio->count].mirror_num = mirror_num; - - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); + sbio->pagev[sbio->page_count] = spage; + ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + if (sbio->page_count < 1) { + bio_put(sbio->bio); + sbio->bio = NULL; + return -EIO; + } scrub_submit(sdev); goto again; } - if (csum) { - sbio->spag[sbio->count].have_csum = 1; - memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); + scrub_block_get(sblock); /* one for the added page */ + atomic_inc(&sblock->outstanding_pages); + sbio->page_count++; + if (sbio->page_count == sdev->pages_per_bio) + scrub_submit(sdev); + + return 0; +} + +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, + u64 physical, u64 flags, u64 gen, int mirror_num, + u8 *csum, int force) +{ + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page later on */ + atomic_set(&sblock->ref_count, 1); + sblock->sdev = sdev; + sblock->no_io_error_seen = 1; + + for (index = 0; len > 0; index++) { + struct scrub_page *spage = sblock->pagev + index; + u64 l = min_t(u64, len, PAGE_SIZE); + + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) { + spin_lock(&sdev->stat_lock); + sdev->stat.malloc_errors++; + spin_unlock(&sdev->stat_lock); + while (index > 0) { + index--; + __free_page(sblock->pagev[index].page); + } + kfree(sblock); + return -ENOMEM; + } + spage->sblock = sblock; + spage->bdev = sdev->dev->bdev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sdev->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + len -= l; + logical += l; + physical += l; + } + + BUG_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev + index; + int ret; + + ret = scrub_add_page_to_bio(sdev, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } } - ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) + + if (force) scrub_submit(sdev); + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); return 0; } +static void scrub_bio_end_io(struct bio *bio, int err) +{ + struct scrub_bio *sbio = bio->bi_private; + struct scrub_dev *sdev = sbio->sdev; + struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + + sbio->err = err; + sbio->bio = bio; + + btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); +} + +static void scrub_bio_end_io_worker(struct btrfs_work *work) +{ + struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); + struct scrub_dev *sdev = sbio->sdev; + int i; + + BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); + if (sbio->err) { + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + + spage->io_error = 1; + spage->sblock->no_io_error_seen = 0; + } + } + + /* now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->page_count; i++) { + struct scrub_page *spage = sbio->pagev[i]; + struct scrub_block *sblock = spage->sblock; + + if (atomic_dec_and_test(&sblock->outstanding_pages)) + scrub_block_complete(sblock); + scrub_block_put(sblock); + } + + if (sbio->err) { + /* what is this good for??? */ + sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bio->bi_phys_segments = 0; + sbio->bio->bi_idx = 0; + + for (i = 0; i < sbio->page_count; i++) { + struct bio_vec *bi; + bi = &sbio->bio->bi_io_vec[i]; + bi->bv_offset = 0; + bi->bv_len = PAGE_SIZE; + } + } + + bio_put(sbio->bio); + sbio->bio = NULL; + spin_lock(&sdev->list_lock); + sbio->next_free = sdev->first_free; + sdev->first_free = sbio->index; + spin_unlock(&sdev->list_lock); + atomic_dec(&sdev->in_flight); + wake_up(&sdev->list_wait); +} + +static void scrub_block_complete(struct scrub_block *sblock) +{ + if (!sblock->no_io_error_seen) + scrub_handle_errored_block(sblock); + else + scrub_checksum(sblock); +} + static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum) { @@ -1042,7 +1672,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, int ret = 0; unsigned long i; unsigned long num_sectors; - u32 sectorsize = sdev->dev->dev_root->sectorsize; while (!list_empty(&sdev->csum_list)) { sum = list_first_entry(&sdev->csum_list, @@ -1060,7 +1689,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, if (!sum) return 0; - num_sectors = sum->len / sectorsize; + num_sectors = sum->len / sdev->sectorsize; for (i = 0; i < num_sectors; ++i) { if (sum->sums[i].bytenr == logical) { memcpy(csum, &sum->sums[i].sum, sdev->csum_size); @@ -1081,9 +1710,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, { int ret; u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sdev->sectorsize; + spin_lock(&sdev->stat_lock); + sdev->stat.data_extents_scrubbed++; + sdev->stat.data_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + BUG_ON(sdev->nodesize != sdev->leafsize); + blocksize = sdev->nodesize; + spin_lock(&sdev->stat_lock); + sdev->stat.tree_extents_scrubbed++; + sdev->stat.tree_bytes_scrubbed += len; + spin_unlock(&sdev->stat_lock); + } else { + blocksize = sdev->sectorsize; + BUG_ON(1); + } while (len) { - u64 l = min_t(u64, len, PAGE_SIZE); + u64 l = min_t(u64, len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -1092,8 +1740,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, if (have_csum == 0) ++sdev->stat.no_csum; } - ret = scrub_page(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? csum : NULL, 0); + ret = scrub_pages(sdev, logical, l, physical, flags, gen, + mirror_num, have_csum ? csum : NULL, 0); if (ret) return ret; len -= l; @@ -1158,6 +1806,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; + /* + * work on commit root. The related disk blocks are static as + * long as COW is applied. This means, it is save to rewrite + * them to repair disk errors without any race conditions + */ path->search_commit_root = 1; path->skip_locking = 1; @@ -1511,8 +2164,8 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) break; - ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); + ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); if (ret) return ret; } @@ -1571,10 +2224,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, /* * check some assumptions */ - if (root->sectorsize != PAGE_SIZE || - root->sectorsize != root->leafsize || - root->sectorsize != root->nodesize) { - printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); + if (root->nodesize != root->leafsize) { + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", + root->nodesize, root->leafsize); + return -EINVAL; + } + + if (root->nodesize > BTRFS_STRIPE_LEN) { + /* + * in this case scrub is unable to calculate the checksum + * the way scrub is implemented. Do not handle this + * situation at all because it won't ever happen. + */ + printk(KERN_ERR + "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", + root->nodesize, BTRFS_STRIPE_LEN); + return -EINVAL; + } + + if (root->sectorsize != PAGE_SIZE) { + /* not supported for data w/o checksums */ + printk(KERN_ERR + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", + root->sectorsize, (unsigned long long)PAGE_SIZE); return -EINVAL; } -- cgit v1.2.3 From fcd1f065daca593badb7f99d473639cf3b551795 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Mar 2012 00:06:18 +0100 Subject: Btrfs: enhance superblock sanity checks Validate checksum algorithm during mount and prevent BUG_ON later in btrfs_super_csum_size. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 438993e3d83..38c2ee1ca0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,7 +48,7 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); static void btrfs_destroy_ordered_operations(struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); @@ -2135,7 +2135,12 @@ int open_ctree(struct super_block *sb, /* check FS state, whether FS is broken. */ fs_info->fs_state |= btrfs_super_flags(disk_super); - btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + err = ret; + goto fail_alloc; + } /* * run through our array of backup supers and setup @@ -3324,15 +3329,23 @@ out: return 0; } -static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); + return -EINVAL; + } + if (read_only) - return; + return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { printk(KERN_WARNING "warning: mount fs with errors, " "running btrfsck is recommended\n"); + } + + return 0; } int btrfs_error_commit_super(struct btrfs_root *root) -- cgit v1.2.3 From 65139ed99234d8505948cdb7a835452eb5c191f9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 Feb 2012 12:26:09 +0100 Subject: btrfs: disallow unequal data/metadata blocksize for mixed block groups With support for bigger metadata blocks, we must avoid mounting a filesystem with different block size for mixed block groups, this causes corruption (found by xfstests/083). Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 38c2ee1ca0d..fe087847c8e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2302,6 +2302,14 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (leafsize != nodesize || sectorsize != nodesize)) { + printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_sb_buffer; + } + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); -- cgit v1.2.3 From 3c4bb26b213e618473e486776483a5bad15ba6da Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 27 Mar 2012 18:56:56 -0400 Subject: Btrfs: flush out and clean up any block device pages during mount Btrfs puts the filesystem metadata into its own address space, and somehow the block device address space isn't getting onto disk properly before a mount. The end result is that a loop of mkfs and mounting the filesystem will sometimes find stale or incorrect data. This commit should fix it by sprinkling fdatawrites and invalidate_bdev calls around. This is a short term measure to make sure it is fixed. The block devices really should be flushed and cleaned up higher in the stack. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + fs/btrfs/volumes.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7b55eee15a5..fd43f6b2f43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2031,6 +2031,7 @@ int open_ctree(struct super_block *sb, __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); + invalidate_bdev(fs_devices->latest_bdev); bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 68a1754fe36..a872b48be0a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -622,6 +622,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, printk(KERN_INFO "open %s failed\n", device->name); goto error; } + filemap_write_and_wait(bdev->bd_inode->i_mapping); + invalidate_bdev(bdev); set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); @@ -1354,6 +1356,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } set_blocksize(bdev, 4096); + invalidate_bdev(bdev); bh = btrfs_read_dev_super(bdev); if (!bh) { ret = -EINVAL; -- cgit v1.2.3 From 7ca4be45a0255ac8f08c05491c6add2dd87dd4f8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Jan 2012 20:19:02 -0500 Subject: Btrfs: don't use crc items bigger than 4KB With the big metadata blocks, we can have crc items that are much bigger than a page. There are a few places that we try to kmalloc memory to hold the items during a split. Items bigger than 4KB don't really have a huge benefit in efficiency, but they do trigger larger order allocations. This commits changes the csums to make sure they stay under 4KB. This is not a format change, just a #define to limit huge items. Signed-off-by: Chris Mason --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a14dbca5974..cab0ffb5ef3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -25,10 +25,12 @@ #include "transaction.h" #include "print-tree.h" -#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ +#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ size) - 1)) +#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) + #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ sizeof(struct btrfs_sector_sum) * \ -- cgit v1.2.3 From 2bcc0328c3a043880796a602c75fbeb1537aa1e1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:44 -0400 Subject: Btrfs: show useful info in space reservation tracepoint o For space info, the type of space info is useful for debug. o For transaction handle, its transid is useful. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 29 ++++++++++------------------- fs/btrfs/inode-map.c | 6 ++---- fs/btrfs/transaction.c | 3 +-- 3 files changed, 13 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8fe517bd852..7c233407bee 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3363,8 +3363,7 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 1); + data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3385,8 +3384,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)(unsigned long)data_sinfo, - bytes, 0); + data_sinfo->flags, bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3751,9 +3749,7 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { /* @@ -3822,9 +3818,7 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)(unsigned long)space_info, - orig_bytes, 1); + "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3970,8 +3964,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + space_info->flags, num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4189,14 +4182,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 1); + sinfo->flags, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)(unsigned long)sinfo, num_bytes, 0); + sinfo->flags, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4250,8 +4243,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, return; trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4770,9 +4762,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)(unsigned long)space_info, - num_bytes, 0); + "space_info", space_info->flags, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 7ca46e6e11a..b1a1c929ba8 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -439,8 +439,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (ret) goto out; trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 1); + trans->transid, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { @@ -507,8 +506,7 @@ out_put: iput(inode); out_release: trace_btrfs_space_reservation(root->fs_info, "ino_cache", - (u64)(unsigned long)trans, - trans->bytes_reserved, 0); + trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 63f835aa978..8da29e8e4de 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -337,8 +337,7 @@ again: if (num_bytes) { trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)(unsigned long)h, - num_bytes, 1); + h->transid, num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } -- cgit v1.2.3 From 15d1ff8111aad85d8b40ee396758990d17a2caac Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:44 -0400 Subject: Btrfs: fix deadlock during allocating chunks This deadlock comes from xfstests 251. We'll hold the chunk_mutex throughout the whole of a chunk allocation. But if we find that we've used up system chunk space, we need to allocate a new system chunk, but this will lead to a recursion of chunk allocation and end up with a deadlock on chunk_mutex. So instead we need to allocate the system chunk first if we find we're in ENOSPC. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7c233407bee..a84420491c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3445,6 +3445,50 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } +static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +{ + u64 num_dev; + + if (type & BTRFS_BLOCK_GROUP_RAID10 || + type & BTRFS_BLOCK_GROUP_RAID0) + num_dev = root->fs_info->fs_devices->rw_devices; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_dev = 2; + else + num_dev = 1; /* DUP or single */ + + /* metadata for updaing devices and chunk tree */ + return btrfs_calc_trans_metadata_size(root, num_dev + 1); +} + +static void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + struct btrfs_space_info *info; + u64 left; + u64 thresh; + + info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly; + spin_unlock(&info->lock); + + thresh = get_system_chunk_thresh(root, type); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", + left, thresh, type); + dump_space_info(info, 0, 0); + } + + if (left < thresh) { + u64 flags; + + flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); + btrfs_alloc_chunk(trans, root, flags); + } +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -3515,6 +3559,12 @@ again: force_metadata_allocation(fs_info); } + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, extent_root, flags); + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret < 0 && ret != -ENOSPC) goto out; -- cgit v1.2.3 From ecb8bea87d05fd2d1fc0718e1e4bbf09c7c6045a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:44 -0400 Subject: Btrfs: fix race between direct io and autodefrag The bug is from running xfstests 209 with autodefrag. The race is as follows: t1 t2(autodefrag) direct IO invalidate pagecache dio(old data) add_inode_defrag invalidate pagecache endio direct IO invalidate pagecache run_defrag readpage(old data) set page dirty (old data) dio(new data, rewrite) invalidate pagecache (*) endio t2(autodefrag) will get old data into pagecache via readpage and set pagecache dirty. Meanwhile, invalidate pagecache(*) will fail due to dirty flags in pages. So the old data may be flushed into disk by flush thread, which will lead to data loss. And so does the case of user defragment progs. The patch fixes this race by holding i_mutex when we readpage and set page dirty. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a979ab7d396..45910d4b8f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1137,12 +1137,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += max_cluster; } + mutex_lock(&inode->i_mutex); ret = cluster_pages_for_defrag(inode, pages, i, cluster); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&inode->i_mutex); goto out_ra; + } defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + mutex_unlock(&inode->i_mutex); if (newer_than) { if (newer_off == (u64)-1) -- cgit v1.2.3 From 1f12bd063285b059cb63315d1424dae1ddd87a64 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:44 -0400 Subject: Btrfs: fix the mismatch of page->mapping commit 600a45e1d5e376f679ff9ecc4ce9452710a6d27c (Btrfs: fix deadlock on page lock when doing auto-defragment) fixes the deadlock on page, but it also introduces another bug. A page may have been truncated after unlock & lock. So we need to find it again to get the right one. And since we've held i_mutex lock, inode size remains unchanged and we can drop isize overflow checks. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 45910d4b8f6..de46fa21859 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -871,6 +871,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 isize = i_size_read(inode); u64 page_start; u64 page_end; + u64 page_cnt; int ret; int i; int i_done; @@ -879,19 +880,21 @@ static int cluster_pages_for_defrag(struct inode *inode, struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - if (isize == 0) - return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || start_index > file_end) + return 0; + + page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ - for (i = 0; i < num_pages; i++) { + for (i = 0; i < page_cnt; i++) { struct page *page; again: page = find_or_create_page(inode->i_mapping, @@ -913,6 +916,15 @@ again: btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); + /* + * we unlocked the page above, so we need check if + * it was released or not. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } } if (!PageUptodate(page)) { @@ -926,15 +938,6 @@ again: } } - isize = i_size_read(inode); - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end) { - /* whoops, we blew past eof, skip this page */ - unlock_page(page); - page_cache_release(page); - break; - } - if (page->mapping != inode->i_mapping) { unlock_page(page); page_cache_release(page); @@ -967,12 +970,12 @@ again: EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, GFP_NOFS); - if (i_done != num_pages) { + if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (num_pages - i_done) << PAGE_CACHE_SHIFT); + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -997,7 +1000,7 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); return ret; } -- cgit v1.2.3 From 4cb13e5d6ecc47b91b24a35f8fbc2c9f33d075fe Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:45 -0400 Subject: Btrfs: fix recursive defragment with autodefrag option $ mkfs.btrfs disk $ mount disk /mnt -o autodefrag $ dd if=/dev/zero of=/mnt/foobar bs=4k count=10 2>/dev/null && sync $ for i in `seq 9 -2 0`; do dd if=/dev/zero of=/mnt/foobar bs=4k count=1 \ seek=$i conv=notrunc 2> /dev/null; done && sync then we'll get to defrag "foobar" again and again. So does option "-o autodefrag,compress". Reasons: When the cleaner kthread gets to fetch inodes from the defrag tree and defrag them, it will dirty pages and submit them, this will comes to another DATA COW where the processing inode will be inserted to the defrag tree again. This patch sets a rule for COW code, i.e. insert an inode when we're really going to make some defragments. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb6aec7bbac..1be31368e88 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -347,8 +347,9 @@ static noinline int compress_file_range(struct inode *inode, int will_compress; int compress_type = root->fs_info->compress_type; - /* if this is a small write inside eof, kick off a defragbot */ - if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) + /* if this is a small write inside eof, kick off a defrag */ + if ((end - start + 1) < 16 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); actual_end = min_t(u64, isize, end + 1); @@ -843,7 +844,8 @@ static noinline int cow_file_range(struct inode *inode, ret = 0; /* if this is a small write inside eof, kick off defrag */ - if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) + if (num_bytes < 64 * 1024 && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(trans, inode); if (start == 0) { -- cgit v1.2.3 From 17ce6ef8d731af5edac8c39e806db4c7e1f6956f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:45 -0400 Subject: Btrfs: add a check to decide if we should defrag the range If our file's layout is as follows: | hole | data1 | hole | data2 | we do not need to defrag this file, because this file has holes and cannot be merged into one extent. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de46fa21859..6c39d1a5590 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -784,6 +784,31 @@ none: return -ENOENT; } +/* + * Validaty check of prev em and next em: + * 1) no prev/next em + * 2) prev/next em is an hole/inline extent + */ +static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *prev = NULL, *next = NULL; + int ret = 0; + + read_lock(&em_tree->lock); + prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); + next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + read_unlock(&em_tree->lock); + + if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && + (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) + ret = 1; + free_extent_map(prev); + free_extent_map(next); + + return ret; +} + static int should_defrag_range(struct inode *inode, u64 start, u64 len, int thresh, u64 *last_len, u64 *skip, u64 *defrag_end) @@ -821,8 +846,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, } /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + ret = 0; + goto out; + } + + /* If we have nothing to merge with us, just skip. */ + if (check_adjacent_extents(inode, em)) { ret = 0; + goto out; + } /* * we hit a real extent, if it is big don't bother defragging it again @@ -830,6 +863,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) ret = 0; +out: /* * last_len ends up being a counter of how many bytes we've defragged. * every time we choose not to defrag an extent, we reset *last_len -- cgit v1.2.3 From 66c2689226ac322fbc9acd2e8e418b78dcd52f51 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:45 -0400 Subject: Btrfs: do not bother to defrag an extent if it is a big real extent $ mkfs.btrfs /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs/ -oautodefrag $ dd if=/dev/zero of=/mnt/btrfs/foobar bs=4k count=10 oflag=direct 2>/dev/null $ filefrag -v /mnt/btrfs/foobar Filesystem type is: 9123683e File size of /mnt/btrfs/foobar is 40960 (10 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3072 10 eof /mnt/btrfs/foobar: 1 extent found Now we have a big real extent [0, 40960), but autodefrag will still defrag it. $ sync $ filefrag -v /mnt/btrfs/foobar Filesystem type is: 9123683e File size of /mnt/btrfs/foobar is 40960 (10 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3082 10 eof /mnt/btrfs/foobar: 1 extent found So if we already find a big real extent, we're ok about that, just skip it. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c39d1a5590..afde837644e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1140,12 +1140,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; - if (!newer_than && - !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, - extent_thresh, - &last_len, &skip, - &defrag_end)) { + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, extent_thresh, + &last_len, &skip, &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip -- cgit v1.2.3 From e1f041e14cfb322f41f41a308bfede00f1b080cd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2012 09:57:45 -0400 Subject: Btrfs: update to the right index of defragment When we use autodefrag, we forget to update the index which indicates the last page we've dirty. And we'll set dirty flags on a same set of pages again and again. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index afde837644e..18cc23d164a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1186,6 +1186,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_off == (u64)-1) break; + if (ret > 0) + i += ret; + newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); -- cgit v1.2.3 From bc3f116fec194f1d7329b160c266fe16b9266a1e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 29 Mar 2012 17:02:47 -0400 Subject: Btrfs: update the checks for mixed block groups with big metadata blocks Dave Sterba had put in patches to look for mixed data/metadata groups with metadata bigger than 4KB. But these ended up in the wrong place and it wasn't testing the feature flag correctly. This updates the tests to make sure our sizes are matching Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd43f6b2f43..20196f41120 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2120,6 +2120,23 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions + */ + if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (sectorsize != leafsize)) { + printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " + "are not allowed for mixed block groups on %s\n", + sb->s_id); + goto fail_alloc; + } + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -2223,10 +2240,6 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); - nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); tree_root->nodesize = nodesize; tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; @@ -2247,14 +2260,6 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (leafsize != nodesize || sectorsize != nodesize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); -- cgit v1.2.3 From 9c017abc50f00e80aee6a705a9207fa818a48dda Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 27 Feb 2012 00:04:56 +0100 Subject: btrfs: assignment in write_dev_flush() doesn't need two semi-colons One is enough. Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 811d9f918b1..d24a841a872 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2732,7 +2732,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * one reference for us, and we leave it for the * caller */ - device->flush_bio = NULL;; + device->flush_bio = NULL; bio = bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; -- cgit v1.2.3 From 8e62c2de6e23e5c1fee04f59de51b54cc2868ca5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Apr 2012 13:46:48 -0400 Subject: Revert "Btrfs: increase the global block reserve estimates" This reverts commit 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf. We've had a number of complaints of early enospc that bisect down to this patch. We'll hae to fix the reservations differently. CC: stable@kernel.org Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a84420491c1..ace5e8cef03 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4205,7 +4205,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3) * 2; + num_bytes = div64_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } -- cgit v1.2.3 From d95603b262edb53d6016a8df0c150371d4d61e67 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Apr 2012 15:55:15 -0400 Subject: Btrfs: fix uninit variable in repair_eb_io_failure We'd have to be passing bogus extent buffers for this uninit variable to actually be used, but set it to zero just in case. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0c3ec003f27..59ec105444f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1937,7 +1937,7 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); - int ret; + int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); -- cgit v1.2.3 From b89203f74bdfcb15407d54d3f257b16a2ea19e62 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: fix eof while discarding extents We miscalculate the length of extents we're discarding, and it leads to an eof of device. Reported-by: Daniel Blueman Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a872b48be0a..759d02486d7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3833,6 +3833,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; + u32 last_stripe = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -3846,6 +3847,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, stripe_nr_orig, factor, &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; } for (i = 0; i < num_stripes; i++) { @@ -3858,16 +3861,29 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, BTRFS_BLOCK_GROUP_RAID10)) { bbio->stripes[i].length = stripes_per_dev * map->stripe_len; + if (i / sub_stripes < remaining_stripes) bbio->stripes[i].length += map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - if ((i / sub_stripes + 1) % - sub_stripes == remaining_stripes) + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) bbio->stripes[i].length -= stripe_end_offset; + if (i == sub_stripes - 1) stripe_offset = 0; } else -- cgit v1.2.3 From c6664b42c4e567792abdb17c958fb01c5bcfcb3a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: remove lock assert from get_restripe_target() This fixes a regression introduced by fc67c450. spin_is_locked() always returns 0 on UP kernels, which caused assert in get_restripe_target() to be fired on every call from btrfs_reduce_alloc_profile() on UP systems. Remove it completely for now, it's not clear if it's going to be needed in future. Reported-by: Bobby Powers Reported-by: Mitch Harder Tested-by: Mitch Harder Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ace5e8cef03..a2134d8141c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3152,15 +3152,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) /* * returns target flags in extended format or 0 if restripe for this * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held */ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; - BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) && - !spin_is_locked(&fs_info->balance_lock)); - if (!bctl) return 0; -- cgit v1.2.3 From e627ee7bcd42b4e3a03ca01a8e46dcb4033c5ae0 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: check return value of bio_alloc() properly bio_alloc() has the possibility of returning NULL. So, it is necessary to check the return value. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/scrub.c | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d11afa67c7d..646f5e6f256 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -405,6 +405,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -687,6 +688,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + BUG_ON(!comp_bio); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 59ec105444f..4789770f8ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2180,6 +2180,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + free_io_failure(inode, failrec, 0); + return -EIO; + } bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c9a2c1aef4b..60f0e28db31 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1044,6 +1044,8 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, BUG_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; bio->bi_bdev = page->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; @@ -1172,6 +1174,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, DECLARE_COMPLETION_ONSTACK(complete); bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; bio->bi_bdev = page_bad->bdev; bio->bi_sector = page_bad->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; -- cgit v1.2.3 From 4edc2ca388d62abffe38149f6ac00e749ea721c5 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: fix use-after-free in __btrfs_end_transaction 49b25e0540904be0bf558b84475c69d72e4de66e introduced a use-after-free bug that caused spurious -EIO's to be returned. Do the check before we free the transaction. Cc: David Sterba Cc: Jeff Mahoney Signed-off-by: Dave Jones Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8da29e8e4de..11b77a59db6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -480,6 +480,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; + int err = 0; if (--trans->use_count) { trans->block_rsv = trans->orig_rsv; @@ -532,18 +533,18 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; - memset(trans, 0, sizeof(*trans)); - kmem_cache_free(btrfs_trans_handle_cachep, trans); if (throttle) btrfs_run_delayed_iputs(root); if (trans->aborted || root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - return -EIO; + err = -EIO; } - return 0; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return err; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From d53ba47484ed6245e640ee4bfe9d21e9bfc15765 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Apr 2012 16:03:57 -0400 Subject: Btrfs: use commit root when loading free space cache A user reported that booting his box up with btrfs root on 3.4 was way slower than on 3.3 because I removed the ideal caching code. It turns out that we don't load the free space cache if we're in a commit for deadlock reasons, but since we're reading the cache and it hasn't changed yet we are safe reading the inode and free space item from the commit root, so do that and remove all of the deadlock checks so we don't unnecessarily skip loading the free space cache. The user reported this fixed the slowness. Thanks, Tested-by: Calvin Walton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 +--- fs/btrfs/free-space-cache.c | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a2134d8141c..2b35f8d14bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -529,9 +529,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * allocate blocks for the tree root we can't do the fast caching since * we likely hold important locks. */ - if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root) && - btrfs_test_opt(root, SPACE_CACHE)) { + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 054707ed579..baaa518baaf 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -747,13 +747,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, bool matched; u64 used = btrfs_block_group_used(&block_group->item); - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. - */ - if (btrfs_fs_closing(fs_info)) - return 0; - /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. @@ -768,6 +761,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path = btrfs_alloc_path(); if (!path) return 0; + path->search_commit_root = 1; + path->skip_locking = 1; inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) { -- cgit v1.2.3 From 6ed3cf2cdfce4c9f1d73171bd3f27d9cb77b734e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 11:49:04 -0400 Subject: btrfs: btrfs_root_readonly() broken on big-endian ->root_flags is __le64 and all accesses to it go through the helpers that do proper conversions. Except for btrfs_root_readonly(), which checks bit 0 as in host-endian... Signed-off-by: Al Viro --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5b8ef8eb352..3f65a812e28 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2166,7 +2166,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, static inline bool btrfs_root_readonly(struct btrfs_root *root) { - return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } /* struct btrfs_root_backup */ -- cgit v1.2.3 From 848cce0d4102b5b4b26b0987b43e1919d462afe2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Feb 2012 17:04:28 +0800 Subject: Btrfs: avoid setting ->d_op twice Follow those instructions, and you'll trigger a warning in the beginning of d_set_d_op(): # mkfs.btrfs /dev/loop3 # mount /dev/loop3 /mnt # btrfs sub create /mnt/sub # btrfs sub snap /mnt /mnt/snap # touch /mnt/snap/sub touch: cannot touch `tmp': Permission denied __d_alloc() set d_op to sb->s_d_op (btrfs_dentry_operations), and then simple_lookup() reset it to simple_dentry_operations, which triggered the warning. Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1be31368e88..a682c267576 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4069,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->dummy_inode = 1; inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -4140,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; + struct inode *inode = dentry->d_inode; - if (!dentry->d_inode && !IS_ROOT(dentry)) - dentry = dentry->d_parent; + if (!inode && !IS_ROOT(dentry)) + inode = dentry->d_parent->d_inode; - if (dentry->d_inode) { - root = BTRFS_I(dentry->d_inode)->root; + if (inode) { + root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; + + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; } return 0; } -- cgit v1.2.3 From 8c9c2bf7a3c4f7e9d158c0be9c49f372fb943ad2 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 25 Feb 2012 09:09:30 +0100 Subject: btrfs: fix race in reada When inserting into the radix tree returns EEXIST, get the existing entry without giving up the spinlock in between. There was a race for both the zones trees and the extent tree. Signed-off-by: Arne Jansen --- fs/btrfs/inode.c | 8 +++++++- fs/btrfs/reada.c | 35 ++++++++++++++++------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a682c267576..98ee5a51aa2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4332,7 +4332,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, } no_dentry: /* is this a reference to our own snapshot? If so - * skip it + * skip it. + * + * In contrast to old kernels, we insert the snapshot's + * dir item and dir index after it has been created, so + * we won't find a reference to our own snapshot. We + * still keep the following code for backward + * compatibility. */ if (location.type == BTRFS_ROOT_ITEM_KEY && location.objectid == root->root_key.objectid) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dc5d33146fd..8dec650099c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio) { int ret; - int looped = 0; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; u64 start; u64 end; int i; -again: zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, @@ -274,9 +272,6 @@ again: spin_unlock(&fs_info->reada_lock); } - if (looped) - return NULL; - cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; @@ -307,13 +302,15 @@ again: ret = radix_tree_insert(&dev->reada_zones, (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); - spin_unlock(&fs_info->reada_lock); - if (ret) { + if (ret == -EEXIST) { kfree(zone); - looped = 1; - goto again; + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); } + spin_unlock(&fs_info->reada_lock); return zone; } @@ -323,8 +320,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_key *top, int level) { int ret; - int looped = 0; struct reada_extent *re = NULL; + struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; @@ -335,14 +332,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; -again: spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) kref_get(&re->refcnt); spin_unlock(&fs_info->reada_lock); - if (re || looped) + if (re) return re; re = kzalloc(sizeof(*re), GFP_NOFS); @@ -398,12 +394,15 @@ again: /* insert extent in reada_tree + all per-device trees, all or nothing */ spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret == -EEXIST) { + re_exist = radix_tree_lookup(&fs_info->reada_tree, index); + BUG_ON(!re_exist); + kref_get(&re_exist->refcnt); + spin_unlock(&fs_info->reada_lock); + goto error; + } if (ret) { spin_unlock(&fs_info->reada_lock); - if (ret != -ENOMEM) { - /* someone inserted the extent in the meantime */ - looped = 1; - } goto error; } for (i = 0; i < nzones; ++i) { @@ -450,9 +449,7 @@ error: } kfree(bbio); kfree(re); - if (looped) - goto again; - return NULL; + return re_exist; } static void reada_kref_dummy(struct kref *kr) -- cgit v1.2.3 From 207a232ccac0a8cb79d304bd17298dbc96e2e082 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 25 Feb 2012 09:09:47 +0100 Subject: btrfs: don't add both copies of DUP to reada extent tree Normally when there are 2 copies of a block, we add both to the reada extent tree and prefetch only the one that is easier to reach. This way we can better utilize multiple devices. In case of DUP this makes no sense as both copies reside on the same device. Signed-off-by: Arne Jansen --- fs/btrfs/reada.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 8dec650099c..ac5d0108588 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -326,6 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; + struct btrfs_device *prev_dev; u32 blocksize; u64 length; int nzones = 0; @@ -405,8 +406,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); goto error; } + prev_dev = NULL; for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; + if (dev == prev_dev) { + /* + * in case of DUP, just add the first zone. As both + * are on the same device, there's nothing to gain + * from adding both. + * Also, it wouldn't work, as the tree is per device + * and adding would fail with EEXIST + */ + continue; + } + prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { -- cgit v1.2.3 From 8d082fb727ac11930ea20bf1612e334ea7c2b697 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 3 Apr 2012 09:56:53 +0800 Subject: Btrfs: do not mount when we have a sectorsize unequal to PAGE_SIZE Our code is not ready to cope with a sectorsize that's not equal to PAGE_SIZE. It will lead to hanging-on while writing something. Signed-off-by: Liu Bo --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20196f41120..b9866f27ebd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2254,9 +2254,9 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - if (sectorsize < PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size " - "found on %s\n", sb->s_id); + if (sectorsize != PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } -- cgit v1.2.3 From 871383be592ba7e819d27556591e315a0df38cee Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 2 Apr 2012 18:31:37 +0200 Subject: btrfs: add missing unlocks to transaction abort paths Added in commit 49b25e0540904be0bf558b84475c69d72e4de66e ("btrfs: enhance transaction abort infrastructure") Reported-by: Dan Carpenter Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 11b77a59db6..36422254ef6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -73,8 +73,10 @@ loop: cur_trans = root->fs_info->running_transaction; if (cur_trans) { - if (cur_trans->aborted) + if (cur_trans->aborted) { + spin_unlock(&root->fs_info->trans_lock); return cur_trans->aborted; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -1400,6 +1402,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } @@ -1411,6 +1414,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } -- cgit v1.2.3 From 8e52acf70459020d7e9e9fda25066be4da520943 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 12 Mar 2012 16:39:28 +0800 Subject: Btrfs: retrurn void from clear_state_bit Currently it returns a set of bits that were cleared, but this return value is not used at all. Moreover it doesn't seem to be useful, because we may clear the bits of a few extent_states, but only the cleared bits of last one is returned. Signed-off-by: Li Zefan --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4789770f8ea..05951bdf72c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -404,18 +404,16 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). + * it will optionally wake up any one waiting on this state (wake == 1) * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static int clear_state_bit(struct extent_io_tree *tree, +static void clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int *bits, int wake) { int bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret = state->state & bits_to_clear; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -437,7 +435,6 @@ static int clear_state_bit(struct extent_io_tree *tree, } else { merge_state(tree, state); } - return ret; } static struct extent_state * -- cgit v1.2.3 From cdc6a3952558f00b1bc3b6401e1cf98797632fe2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 12 Mar 2012 16:39:48 +0800 Subject: Btrfs: avoid possible use-after-free in clear_extent_bit() clear_extent_bit() { next_node = rb_next(&state->rb_node); ... clear_state_bit(state); <-- this may free next_node if (next_node) { state = rb_entry(next_node); ... } } clear_state_bit() calls merge_state() which may free the next node of the passing extent_state, so clear_extent_bit() may end up referencing freed memory. Signed-off-by: Li Zefan --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05951bdf72c..11eeb81fe69 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -402,6 +402,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + /* * utility function to clear some bits in an extent state struct. * it will optionally wake up any one waiting on this state (wake == 1) @@ -409,10 +418,11 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static void clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - int *bits, int wake) +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + int *bits, int wake) { + struct extent_state *next; int bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -425,6 +435,7 @@ static void clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { + next = next_state(state); if (state->tree) { rb_erase(&state->rb_node, &tree->state); state->tree = NULL; @@ -434,7 +445,9 @@ static void clear_state_bit(struct extent_io_tree *tree, } } else { merge_state(tree, state); + next = next_state(state); } + return next; } static struct extent_state * @@ -473,7 +486,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; - struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; @@ -525,14 +537,11 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; - if (state->end < end && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) + if (!(state->state & bits)) { + state = next_state(state); goto next; + } /* * | ---- desired range ---- | @@ -590,16 +599,13 @@ hit_next: goto out; } - clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake); next: if (last_end == (u64)-1) goto out; start = last_end + 1; - if (start <= end && next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); + if (start <= end && state && !need_resched()) goto hit_next; - } goto search_again; out: -- cgit v1.2.3 From 4735fb282830c0966b301dabcccf4753fa6604bb Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 12 Apr 2012 22:47:52 +0200 Subject: Btrfs: Make free_ipath() deal gracefully with NULL pointers Make free_ipath() behave like most other freeing functions in the kernel and gracefully do nothing when passed a NULL pointer. Besides this making the bahaviour consistent with functions such as kfree(), vfree(), btrfs_free_path() etc etc, it also fixes a real NULL deref issue in fs/btrfs/ioctl.c::btrfs_ioctl_ino_to_path(). In that function we have this code: ... ipath = init_ipath(size, root, path); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; goto out; } ... out: btrfs_free_path(path); free_ipath(ipath); ... If we ever take the true branch of that 'if' statement we'll end up passing a NULL pointer to free_ipath() which will subsequently dereference it and we'll go "Boom" :-( This patch will avoid that. Signed-off-by: Jesper Juhl --- fs/btrfs/backref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f4e90748940..b332ff04c5e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1414,6 +1414,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + if (!ipath) + return; kfree(ipath->fspath); kfree(ipath); } -- cgit v1.2.3 From aefc1eb13ebbb86c5ffade8a9e2425cd71032d7e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 13 Apr 2012 12:28:00 +0200 Subject: Btrfs: don't call free_extent_buffer twice in iterate_irefs Avoid calling free_extent_buffer more than once when the iterator function returns non-zero. The only code that uses this is scrub repair for corrupted nodatasum blocks. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b332ff04c5e..fb56bcc8037 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1247,7 +1247,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_path *path, iterate_irefs_t *iterate, void *ctx) { - int ret; + int ret = 0; int slot; u32 cur; u32 len; @@ -1259,7 +1259,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_inode_ref *iref; struct btrfs_key found_key; - while (1) { + while (!ret) { ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1288,10 +1288,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); - if (ret) { - free_extent_buffer(eb); + if (ret) break; - } len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } -- cgit v1.2.3 From b916a59adfdc875381b68ced258694b434cf43ae Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 13 Apr 2012 12:28:08 +0200 Subject: Btrfs: add missing read locks in backref.c iref_to_path and iterate_irefs both increment the eb's refcount to use it after releasing the path. Both depend on consistent data remaining in the extent buffer and need a read lock to protect it. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index fb56bcc8037..bcec0675023 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -22,6 +22,7 @@ #include "ulist.h" #include "transaction.h" #include "delayed-ref.h" +#include "locking.h" /* * this structure records all encountered refs on the way up to the root @@ -893,18 +894,22 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = size - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; if (bytes_left >= 0) dest[bytes_left] = '\0'; + path->leave_spinning = 1; while (1) { len = btrfs_inode_ref_name_len(eb, iref); bytes_left -= len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, (unsigned long)(iref + 1), len); - if (eb != eb_in) + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + } ret = inode_ref_info(parent, 0, fs_root, path, &found_key); if (ret > 0) ret = -ENOENT; @@ -919,8 +924,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, slot = path->slots[0]; eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ - if (eb != eb_in) + if (eb != eb_in) { atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); @@ -931,6 +939,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); + path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); @@ -1260,6 +1269,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { + path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1275,6 +1285,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(eb, slot); @@ -1293,6 +1305,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } -- cgit v1.2.3 From 37db63a400d3bac467795aa43901065fd8d903b7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 13 Apr 2012 17:05:08 +0300 Subject: Btrfs: fix max chunk size check in chunk allocator Fix a bug, where in case we need to adjust stripe_size so that the length of the resulting chunk is less than or equal to max_chunk_size, DUP chunks turn out to be only half as big as they could be. Cc: Arne Jansen Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 759d02486d7..ce289af526f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3324,12 +3324,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; num_stripes = ndevs * dev_stripes; - if (stripe_size * num_stripes > max_chunk_size * ncopies) { + if (stripe_size * ndevs > max_chunk_size * ncopies) { stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, num_stripes); + do_div(stripe_size, ndevs); } do_div(stripe_size, dev_stripes); + + /* align to BTRFS_STRIPE_LEN */ do_div(stripe_size, BTRFS_STRIPE_LEN); stripe_size *= BTRFS_STRIPE_LEN; -- cgit v1.2.3 From 8a3db1849e9e2563727ea2dc32737502e0096641 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Mon, 16 Apr 2012 06:44:37 +0300 Subject: btrfs: fix early abort in 'remount' Cc: Jeff Mahoney Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Sergei Trofimovich --- fs/btrfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 84571d7da12..43aa2dd0bc7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1152,13 +1152,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if (fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) { ret = -EACCES; goto restore; + } - if (btrfs_super_log_root(fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; + } ret = btrfs_cleanup_fs_roots(fs_info); if (ret) -- cgit v1.2.3 From 48d282326b3ce5f435835f5fb0e3231c399f4f9a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 14 Apr 2012 11:24:33 +0200 Subject: fs/btrfs/volumes.c: add missing free_fs_devices Free fs_devices as done in the error-handling code just below. Signed-off-by: Julia Lawall --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ce289af526f..3b984173d25 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4352,8 +4352,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) ret = __btrfs_open_devices(fs_devices, FMODE_READ, root->fs_info->bdev_holder); - if (ret) + if (ret) { + free_fs_devices(fs_devices); goto out; + } if (!fs_devices->seeding) { __btrfs_close_devices(fs_devices); -- cgit v1.2.3 From 5cf1ab56133ad7b712673c071b439d4a555a2d1e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Apr 2012 09:42:26 -0400 Subject: Btrfs: always store the mirror we read the eb from A user reported a panic where we were trying to fix a bad mirror but the mirror number we were giving was 0, which is invalid. This is because we don't do the transid verification until after the read, so as far as the read code is concerned the read was a success. So instead store the mirror we read from so that if there is some failure post read we know which mirror to try next and which mirror needs to be fixed if we find a good copy of the block. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 16 ++++++++-------- fs/btrfs/extent_io.c | 15 ++++++--------- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/inode.c | 2 +- 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b9866f27ebd..d0c969beaad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -383,17 +383,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - if (!failed_mirror) { - failed = 1; - printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); - failed_mirror = eb->failed_mirror; - } - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) break; + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } + mirror_num++; if (mirror_num == failed_mirror) mirror_num++; @@ -564,7 +563,7 @@ struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, } static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { struct extent_io_tree *tree; u64 found_start; @@ -589,6 +588,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (!reads_done) goto err; + eb->read_mirror = mirror; if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; goto err; @@ -652,7 +652,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = failed_mirror; + eb->read_mirror = failed_mirror; if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 11eeb81fe69..0c23e57077c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2304,7 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; - int failed_mirror; + int mirror; int ret; if (err) @@ -2343,20 +2343,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); + mirror = (int)(unsigned long)bio->bi_bdev; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, - state); + state, mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate) - failed_mirror = (int)(unsigned long)bio->bi_bdev; - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + ret = tree->ops->readpage_io_failed_hook(page, mirror); if (!ret && !err && test_bit(BIO_UPTODATE, &bio->bi_flags)) uptodate = 1; @@ -2371,8 +2369,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. */ - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + ret = bio_readpage_error(bio, page, start, end, mirror, NULL); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -4465,7 +4462,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = 0; + eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index faf10eb57f7..b516c3b8dec 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,7 +79,7 @@ struct extent_io_ops { u64 start, u64 end, struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state); + struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, @@ -135,7 +135,7 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; - int failed_mirror; + int read_mirror; struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98ee5a51aa2..d953f882046 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1947,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); struct inode *inode = page->mapping->host; -- cgit v1.2.3 From 253beebd5a255e07d6a8b65515491f33664e82a2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Apr 2012 09:59:03 +0300 Subject: Btrfs: double unlock bug in error handling The caller expects this function to return with the lock held and releases it immediately on error. Signed-off-by: Dan Carpenter --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2b35f8d14bb..a0bb9dcd3c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2301,6 +2301,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + spin_lock(&delayed_refs->lock); return ret; } @@ -2331,6 +2332,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + spin_lock(&delayed_refs->lock); return ret; } -- cgit v1.2.3 From b9688bb8459b67e42327de6420edb405a9188775 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 18 Apr 2012 10:27:16 +0200 Subject: btrfs: don't return EINTR It is basically a good thing if we are interruptible when waiting for free space, but the generality in which it is implemented currently leads to system calls being interruptible that are not documented this way. For example git can't handle interrupted unlink(), leading to corrupt repos under space pressure. Instead we raise the bar to only be interruptible by SIGKILL. Thanks to David Sterba for suggesting this. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a0bb9dcd3c3..84497f8eb04 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3771,13 +3771,10 @@ again: */ if (current->journal_info) return -EAGAIN; - ret = wait_event_interruptible(space_info->wait, - !space_info->flush); - /* Must have been interrupted, return */ - if (ret) { - printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__); + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ + if (ret) return -EINTR; - } spin_lock(&space_info->lock); } -- cgit v1.2.3 From 99ba55ad696944b37d5557bc5b4816890854fdb9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 19 Mar 2012 16:17:22 +0100 Subject: Btrfs: fix btrfs_ioctl_dev_info() crash on missing device When a filesystem is mounted with the degraded option, it is possible that some of the devices are not there. btrfs_ioctl_dev_info() crashs in this case because the device name is a NULL pointer. This ioctl was only used for scrub. Signed-off-by: Stefan Behrens --- fs/btrfs/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 18cc23d164a..14f8e1faa46 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2262,7 +2262,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + if (dev->name) + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + else + di_args->path[0] = '\0'; out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) -- cgit v1.2.3 From 5c84fc3c3914e9adfa6155a167c6c0c2709e6a62 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 30 Mar 2012 13:58:31 +0200 Subject: Btrfs: don't count CRC or header errors twice while scrubbing Each CRC or header error was counted twice, this is now fixed. Signed-off-by: Stefan Behrens --- fs/btrfs/scrub.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 60f0e28db31..b679bf68861 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1258,12 +1258,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - if (fail) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); - } - return fail; } @@ -1336,15 +1330,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++crc_fail; - if (crc_fail || fail) { - spin_lock(&sdev->stat_lock); - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); - } - return fail || crc_fail; } -- cgit v1.2.3 From 25cd999e1a685dab65292afbe9fa24d790d8a859 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 30 Mar 2012 13:58:32 +0200 Subject: Btrfs: fix that check_int_data mount option was ignored The bitfield member mount_opt was too small by one bit to hold the mount option that enabled to include data extents in the integrity checker. Since the same issue happened when the BTRFS_MOUNT_PANIC_ON_FATAL_ERROR option was added (git rebase silently merges so that the increase of the size of the bitfield member is lost), the bit limit was removed entirely. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5b8ef8eb352..ec42a24e935 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1078,7 +1078,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:21; + unsigned long mount_opt; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; -- cgit v1.2.3 From 996d282c7ff470f150a467eb4815b90159d04c47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 20:35:03 -0400 Subject: Btrfs: do not start delalloc inodes during sync btrfs_start_delalloc_inodes will just walk the list of delalloc inodes and start writing them out, but it doesn't splice the list or anything so as long as somebody is doing work on the box you could end up in this section _forever_. So just remove it, it's not needed anyway since sync will start writeback on all inodes anyway, all we need to do is wait for ordered extents and then we can commit the transaction. In my horrible torture test sync goes from taking 4 minutes to about 1.5 minutes. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 43aa2dd0bc7..f267718cbd1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -819,7 +819,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); -- cgit v1.2.3 From 3e74317ad773ba9df36db1fa32848cba41ac4d1a Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 27 Apr 2012 12:41:45 -0400 Subject: Btrfs: fix repair code for RAID10 btrfs_map_block sets mirror_num, so that the repair code knows eventually which device gave us the read error. For RAID10, mirror_num must be 1 or 2. Before this fix mirror_num was incorrectly related to our stripe index. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b984173d25..1411b99555a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3807,10 +3807,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, else if (mirror_num) stripe_index += mirror_num - 1; else { + int old_stripe_index = stripe_index; stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); - mirror_num = stripe_index + 1; + mirror_num = stripe_index - old_stripe_index + 1; } } else { /* -- cgit v1.2.3 From 1daf3540fa77faea2f91d96bcaf07ce48ee827be Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: Prevent root_list corruption I was seeing root_list corruption on unmount during fs resize in 3.4-rc4; add correct locking to address this. Signed-off-by: Daniel J Blueman Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 017281dbb2a..5a105a086ac 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1279,7 +1279,9 @@ static int __update_reloc_root(struct btrfs_root *root, int del) if (rb_node) backref_tree_panic(rb_node, -EEXIST, node->bytenr); } else { + spin_lock(&root->fs_info->trans_lock); list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); kfree(node); } return 0; -- cgit v1.2.3 From 1f699d38b6556c393ac80f1c23c2053502a51631 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: fix block_rsv and space_info lock ordering may_commit_transaction() calls spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); and update_global_block_rsv() calls spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); Lockdep complains about this at run time. Everywhere except in update_global_block_rsv(), the space_info lock is the outer lock, therefore the locking order in update_global_block_rsv() is changed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84497f8eb04..6fc2e6f5aab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4214,8 +4214,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = calc_global_metadata_size(fs_info); - spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); block_rsv->size = num_bytes; @@ -4241,8 +4241,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->full = 1; } - spin_unlock(&sinfo->lock); spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); } static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 7654b72417e10e294563496e25211200f9b8b6d3 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: Fix space checking during fs resize Fix out-of-space checking, addressing a warning and potential resource leak when resizing the filesystem down while allocating blocks. Signed-off-by: Daniel J Blueman Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5a105a086ac..646ee21bb03 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3813,7 +3813,7 @@ restart: ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { - if (ret != -EAGAIN) { + if (ret != -ENOSPC) { err = ret; WARN_ON(1); break; -- cgit v1.2.3 From fede766f28dd766d4e8feb321fdb19edb21ef6fb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 27 Apr 2012 14:23:22 -0400 Subject: Btrfs: avoid deadlocks from GFP_KERNEL allocations during btrfs_real_readdir Btrfs has an optimization where it will preallocate dentries during readdir to fill in enough information to open the inode without an extra lookup. But, we're calling d_alloc, which is doing GFP_KERNEL allocations, and that leads to deadlocks because our readdir code has tree locks held. For now, disable this optimization. We'll fix the gfp mask in the next merge window. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d953f882046..3ce7805d111 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4192,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; - struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4283,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; - struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4304,33 +4302,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - q.name = name_ptr; - q.len = name_len; - q.hash = full_name_hash(q.name, q.len); - tmp = d_lookup(filp->f_dentry, &q); - if (!tmp) { - struct btrfs_key *newkey; - - newkey = kzalloc(sizeof(struct btrfs_key), - GFP_NOFS); - if (!newkey) - goto no_dentry; - tmp = d_alloc(filp->f_dentry, &q); - if (!tmp) { - kfree(newkey); - dput(tmp); - goto no_dentry; - } - memcpy(newkey, &location, - sizeof(struct btrfs_key)); - tmp->d_fsdata = newkey; - tmp->d_flags |= DCACHE_NEED_LOOKUP; - d_rehash(tmp); - dput(tmp); - } else { - dput(tmp); - } -no_dentry: + /* is this a reference to our own snapshot? If so * skip it. * -- cgit v1.2.3 From dc7fdde39e4962b1a88741f7eba2a6b3be1285d8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 27 Apr 2012 14:31:29 -0400 Subject: Btrfs: reduce lock contention during extent insertion We're spending huge amounts of time on lock contention during end_io processing because we unconditionally assume we are overwriting an existing extent in the file for each IO. This checks to see if we are outside i_size, and if so, it uses a less expensive readonly search of the btree to look for existing extents. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d83260d7498..53bf2d764bb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -567,6 +567,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int extent_type; int recow; int ret; + int modify_tree = -1; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -575,10 +576,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, if (!path) return -ENOMEM; + if (start >= BTRFS_I(inode)->disk_i_size) + modify_tree = 0; + while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, - search_start, -1); + search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == start) { @@ -634,7 +638,8 @@ next_slot: } search_start = max(key.offset, start); - if (recow) { + if (recow || !modify_tree) { + modify_tree = -1; btrfs_release_path(path); continue; } -- cgit v1.2.3 From e5846fc665d1c3dd32d877febe7402ccd583b8a1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 May 2012 12:08:48 -0400 Subject: Btrfs: Add properly locking around add_root_to_dirty_list add_root_to_dirty_list happens once at the very beginning of the transaction, but it is still racey. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e801f226d7e..086303b9be6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -220,10 +220,12 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + spin_lock(&root->fs_info->trans_lock); if (root->track_dirty && list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } + spin_unlock(&root->fs_info->trans_lock); } /* -- cgit v1.2.3 From 17de39ac17bf99b8bf0d819d13668d5048836efc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 May 2012 15:16:06 -0400 Subject: Btrfs: fix page leak when allocing extent buffers If we happen to alloc a extent buffer and then alloc a page and notice that page is already attached to an extent buffer, we will only unlock it and free our existing eb. Any pages currently attached to that eb will be properly freed, but we don't do the page_cache_release() on the page where we noticed the other extent buffer which can cause us to leak pages and I hope cause the weird issues we've been seeing in this area. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0c23e57077c..2fb52c26c67 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4120,6 +4120,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (atomic_inc_not_zero(&exists->refs)) { spin_unlock(&mapping->private_lock); unlock_page(p); + page_cache_release(p); mark_extent_buffer_accessed(exists); goto free_eb; } @@ -4199,8 +4200,7 @@ free_eb: unlock_page(eb->pages[i]); } - if (!atomic_dec_and_test(&eb->refs)) - return exists; + WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } -- cgit v1.2.3 From d04b1debc92535453df2494d0b019edf0bb91003 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Fri, 4 May 2012 15:16:06 -0400 Subject: btrfs: Fix mismatching struct members in ioctl.h Fix the size members of btrfs_ioctl_ino_path_args and btrfs_ioctl_logical_ino_args. The user space btrfs-progs utilities used __u64 and the kernel headers used __u32 before. Signed-off-by: Alexander Block Signed-off-by: Chris Mason --- fs/btrfs/ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 4f69028a68c..086e6bdae1c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -252,7 +252,7 @@ struct btrfs_data_container { struct btrfs_ioctl_ino_path_args { __u64 inum; /* in */ - __u32 size; /* in */ + __u64 size; /* in */ __u64 reserved[4]; /* struct btrfs_data_container *fspath; out */ __u64 fspath; /* out */ @@ -260,7 +260,7 @@ struct btrfs_ioctl_ino_path_args { struct btrfs_ioctl_logical_ino_args { __u64 logical; /* in */ - __u32 size; /* in */ + __u64 size; /* in */ __u64 reserved[4]; /* struct btrfs_data_container *inodes; out */ __u64 inodes; -- cgit v1.2.3 From ea9947b4395fa34666086b2fa6f686e94903e047 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 4 May 2012 15:16:07 -0400 Subject: Btrfs: fix crash in scrub repair code when device is missing Fix that when scrub tries to repair an I/O or checksum error and one of the devices containing the mirror is missing, it crashes in bio_add_page because the bdev is a NULL pointer for missing devices. Reported-by: Marco L. Crociani Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b679bf68861..7e487be0094 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -998,6 +998,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page = sblock->pagev + page_index; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; + /* for missing devices, bdev is NULL */ page->bdev = bbio->stripes[mirror_index].dev->bdev; page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); @@ -1042,6 +1043,12 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_page *page = sblock->pagev + page_num; DECLARE_COMPLETION_ONSTACK(complete); + if (page->bdev == NULL) { + page->io_error = 1; + sblock->no_io_error_seen = 0; + continue; + } + BUG_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); if (!bio) -- cgit v1.2.3 From dbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 May 2012 14:48:02 +0200 Subject: vfs: Rename end_writeback() to clear_inode() After we moved inode_sync_wait() from end_writeback() it doesn't make sense to call the function end_writeback() anymore. Rename it to clear_inode() which well says what the function really does - set I_CLEAR flag. Signed-off-by: Jan Kara Signed-off-by: Fengguang Wu --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 115bc05e42b..5c058c4d328 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3756,7 +3756,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: - end_writeback(inode); + clear_inode(inode); return; } -- cgit v1.2.3 From b9fab919b748c7b39c19ff236ed6c5682c266dde Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 May 2012 07:23:47 -0400 Subject: Btrfs: avoid sleeping in verify_parent_transid while atomic verify_parent_transid needs to lock the extent range to make sure no IO is underway, and so it can safely clear the uptodate bits if our checks fail. But, a few callers are using it with spinlocks held. Most of the time, the generation numbers are going to match, and we don't want to switch to a blocking lock just for the error case. This adds an atomic flag to verify_parent_transid, and changes it to return EAGAIN if it needs to block to properly verifiy things. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 26 +++++++++++++++++--------- fs/btrfs/disk-io.c | 18 +++++++++++++----- fs/btrfs/disk-io.h | 3 ++- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/tree-log.c | 2 +- 5 files changed, 34 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 086303b9be6..4106264fbc6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -725,7 +725,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen); + uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { @@ -1360,7 +1360,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); eb = btrfs_find_tree_block(root, block1, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + /* + * if we get -eagain from btrfs_buffer_uptodate, we + * don't want to return eagain here. That will loop + * forever + */ + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block1 = 0; free_extent_buffer(eb); } @@ -1368,7 +1373,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } @@ -1506,8 +1511,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { - if (btrfs_buffer_uptodate(tmp, 0)) { - if (btrfs_buffer_uptodate(tmp, gen)) { + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* * we found an up to date block without * sleeping, return @@ -1525,8 +1531,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, free_extent_buffer(tmp); btrfs_set_path_blocking(p); + /* now we're allowed to do a blocking uptodate check */ tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { *eb_ret = tmp; return 0; } @@ -1561,7 +1568,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!btrfs_buffer_uptodate(tmp, 0)) + if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); } @@ -4045,7 +4052,7 @@ again: tmp = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { free_extent_buffer(tmp); break; } @@ -4168,7 +4175,8 @@ next: struct extent_buffer *cur; cur = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); - if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + if (!cur || + btrfs_buffer_uptodate(cur, gen, 1) <= 0) { slot++; if (cur) free_extent_buffer(cur); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0c969beaad..a7ffc88a7db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -323,7 +323,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, * in the wrong place. */ static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid) + struct extent_buffer *eb, u64 parent_transid, + int atomic) { struct extent_state *cached_state = NULL; int ret; @@ -331,6 +332,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; + if (atomic) + return -EAGAIN; + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); if (extent_buffer_uptodate(eb) && @@ -372,7 +376,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) + if (!ret && !verify_parent_transid(io_tree, eb, + parent_transid, 0)) break; /* @@ -1202,7 +1207,7 @@ static int __must_check find_and_setup_root(struct btrfs_root *tree_root, root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { free_extent_buffer(root->node); root->node = NULL; return -EIO; @@ -3143,7 +3148,8 @@ int close_ctree(struct btrfs_root *root) return 0; } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic) { int ret; struct inode *btree_inode = buf->pages[0]->mapping->host; @@ -3153,7 +3159,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) return ret; ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid); + parent_transid, atomic); + if (ret == -EAGAIN) + return ret; return !ret; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7ace1a2dd1..ab1830aaf0e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -66,7 +66,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + int atomic); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6fc2e6f5aab..49fd7b66d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6568,7 +6568,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, goto skip; } - if (!btrfs_buffer_uptodate(next, generation)) { + if (!btrfs_buffer_uptodate(next, generation, 0)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d017283ae6f..eb1ae908582 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -279,7 +279,7 @@ static int process_one_buffer(struct btrfs_root *log, log->fs_info->extent_root, eb->start, eb->len); - if (btrfs_buffer_uptodate(eb, gen)) { + if (btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) -- cgit v1.2.3 From f775738f6fba9c7f6deaa540860d6fb7e2d28445 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 30 Mar 2012 15:14:27 +0800 Subject: btrfs/ctree.c: remove the unnecessary 'return -1;' at the end of bin_search The code path should not reach there. Remove it. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/ctree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264fbc6..26847999c64 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -854,20 +854,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb, static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot) { - if (level == 0) { + if (level == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), key, btrfs_header_nritems(eb), slot); - } else { + else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), key, btrfs_header_nritems(eb), slot); - } - return -1; } int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, -- cgit v1.2.3 From 1b303fc0545b4bfbb2b8a69eec89e6f077f69b56 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:18 +0800 Subject: Btrfs: cleanup the comment for clear_state_bit in extent_io.c No 'delete' arg is used for clear_state_bit. Cleanup the comment. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9018a05036..aeb98ceda51 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -413,7 +413,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1) + * it will optionally wake up any one waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree -- cgit v1.2.3 From 39bab87ba6f4d8cce2b70c19e60233ad8030d7b4 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:31 +0800 Subject: Btrfs: fix btrfs_release_extent_buffer_page with the right usage of num_extent_pages num_extent_pages returns the number of pages in the specific range, not the index of the last page in the eb range. btrfs_release_extent_buffer_page is called with start_idx set 0 in current codes, so it's not a problem yet. But the logic is indeed wrong. Fix it here. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aeb98ceda51..455daec2b6c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3981,11 +3981,13 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long start_idx) { unsigned long index; + unsigned long num_pages; struct page *page; BUG_ON(extent_buffer_under_io(eb)); - index = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; if (start_idx >= index) return; -- cgit v1.2.3 From 477d7eafa9585ded87ee1c6f69638a6baf9d8922 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:47 +0800 Subject: Btrfs: fix the comment for find_first_extent_bit The return value of find_first_extent_bit is 1 or 0, no < 0. And if found something, return 0; if nothing was found, return 1. Fix the comment. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 455daec2b6c..fe91305e12f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1293,7 +1293,7 @@ out: * returned if we find something, and *start_ret and *end_ret are * set to reflect the state struct that was found. * - * If nothing was found, 1 is returned, < 0 on error + * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits) -- cgit v1.2.3 From fd5e62a37cef5b212318c522eac0ecd394b50a19 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 6 Apr 2012 14:35:05 +0800 Subject: Btrfs: remove the useless assignment to *entry in function tree_insert of file extent_io.c In tree_insert, var *entry is used in the loop only, and is useless out of the loop. Remove the useless assignment after the loop. Signed-off-by: Wang Sheng-Hui --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe91305e12f..82b4829f00b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; -- cgit v1.2.3 From e06baab4184509bdfddd294efc6cae7a410c6f07 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 12 Apr 2012 12:53:40 +0200 Subject: Btrfs: change integrity checker to support big blocks The integrity checker used to be coded for nodesize == leafsize == sectorsize == PAGE_CACHE_SIZE. This is now changed to support sizes for nodesize and leafsize which are N * PAGE_CACHE_SIZE. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 561 +++++++++++++++++++++++++++++++++------------ 1 file changed, 415 insertions(+), 146 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index c053e90f200..7f6cc359e44 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -103,8 +103,6 @@ #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) /* @@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx { u64 dev_bytenr; /* physical bytenr on device */ u32 len; struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ + char **datav; + struct page **pagev; + void *mem_to_free; }; /* This structure is used to implement recursion without occupying @@ -243,6 +242,8 @@ struct btrfsic_state { struct btrfs_root *root; u64 max_superblock_generation; struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; }; static void btrfsic_block_init(struct btrfsic_block *b); @@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, static int btrfsic_process_metablock(struct btrfsic_state *state, struct btrfsic_block *block, struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); + char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( @@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); + u64 dev_bytenr); static struct mutex btrfsic_mutex; static int btrfsic_is_initialized; @@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int pass; BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); return -1; @@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfsic_block *next_block; struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - struct btrfs_header *hdr; - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, mirror_num); if (ret) { @@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, BUG_ON(NULL == l); ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)PAGE_CACHE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", (unsigned long long) @@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, return -1; } - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; ret = btrfsic_process_metablock(state, next_block, &tmp_next_block_ctx, - hdr, BTRFS_MAX_LEVEL + 3, 1); btrfsic_release_block_ctx(&tmp_next_block_ctx); } @@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (NULL == bh) return -1; super_tmp = (struct btrfs_super_block *) @@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror( if (btrfs_super_bytenr(super_tmp) != dev_bytenr || strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_leafsize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; } @@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, mirror_num)) { printk(KERN_INFO "btrfsic: btrfsic_map_block(" @@ -966,13 +975,15 @@ static int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, int first_limit_nesting, int force_iodone_flag) { struct btrfsic_stack_frame initial_stack_frame = { 0 }; struct btrfsic_stack_frame *sf; struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + BUG_ON(!first_hdr); sf = &initial_stack_frame; sf->error = 0; sf->i = -1; @@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); + u32 item_offset; + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = le32_to_cpu(disk_item.offset); + disk_key = &disk_item.key; type = disk_key->type; if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item *) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + + sizeof(struct btrfs_root_item) > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + sizeof(struct btrfs_root_item)); + next_bytenr = le64_to_cpu(root_item.bytenr); sf->error = btrfsic_create_link_to_next_block( @@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame: &sf->num_copies, &sf->mirror_num, disk_key, - le64_to_cpu(root_item-> + le64_to_cpu(root_item. generation)); if (sf->error) goto one_stack_frame_backwards; @@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame: if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); @@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = le64_to_cpu(key_ptr.blockptr); sf->error = btrfsic_create_link_to_next_block( state, @@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame: force_iodone_flag, &sf->num_copies, &sf->mirror_num, - &disk_key_ptr->key, - le64_to_cpu(disk_key_ptr->generation)); + &key_ptr.key, + le64_to_cpu(key_ptr.generation)); if (sf->error) goto one_stack_frame_backwards; if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) @@ -1181,6 +1232,35 @@ one_stack_frame_backwards: return sf->error; } +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block( if (0 == *num_copiesp) { *num_copiesp = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, *num_copiesp); @@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block( "btrfsic_create_link_to_next_block(mirror_num=%d)\n", *mirror_nump); ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, + state->metablock_size, next_block_ctx, *mirror_nump); if (ret) { printk(KERN_INFO @@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block( if (limit_nesting > 0 && did_alloc_block_link) { ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)next_block_ctx->len) { printk(KERN_INFO "btrfsic: read block @logical %llu failed!\n", (unsigned long long)next_bytenr); @@ -1339,43 +1419,55 @@ static int btrfsic_handle_extent_data( u32 item_offset, int force_iodone_flag) { int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; struct btrfsic_block_link *l; + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + + le64_to_cpu(file_extent_item.offset); + generation = le64_to_cpu(file_extent_item.generation); + num_bytes = le64_to_cpu(file_extent_item.num_bytes); + generation = le64_to_cpu(file_extent_item.generation); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, - (unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), + file_extent_item.type, (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + le64_to_cpu(file_extent_item.disk_bytenr), + (unsigned long long)le64_to_cpu(file_extent_item.offset), + (unsigned long long)num_bytes); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) return 0; while (num_bytes > 0) { u32 chunk_len; int num_copies; int mirror_num; - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; else chunk_len = num_bytes; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -1475,8 +1567,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (0 == ret) kfree(multi); @@ -1496,8 +1589,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, block_ctx_out->dev_bytenr = bytenr; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (NULL != block_ctx_out->dev) { return 0; } else { @@ -1508,38 +1602,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; } } static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx) { - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", (unsigned long long)block_ctx->dev_bytenr); return -1; } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; } - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); - if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_sector = dev_bytenr >> 9; + bio->bi_end_io = btrfsic_complete_bio_end_io; + bio->bi_private = &complete; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + submit_bio(READ, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } return block_ctx->len; } +static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -1617,32 +1800,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * (note that this test fails for the super block) */ static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) + char **datav, unsigned int num_pages) { struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; + unsigned int i; - h = (struct btrfs_header *)data; + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; + return 1; - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + + crc = crc32c(crc, data, sublen); + } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; + return 1; - return fail || crc_fail; + return 0; /* is metadata */ } static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw) { @@ -1652,12 +1842,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, int ret; struct btrfsic_state *state = dev_state->state; struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); if (NULL != bio_is_patched) *bio_is_patched = 0; +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, &state->block_hashtable); if (NULL != block) { @@ -1667,8 +1864,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (block->is_superblock) { bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { printk(KERN_INFO @@ -1678,12 +1883,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (is_metadata) { if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); } if (block->logical_bytenr != bytenr) { printk(KERN_INFO @@ -1710,6 +1921,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->mirror_num, btrfsic_get_block_type(state, block)); } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO @@ -1747,7 +1965,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, le64_to_cpu(block->disk_key.offset), (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), + mapped_datav[0])->generation), (unsigned long long) state->max_superblock_generation); btrfsic_dump_tree(state); @@ -1765,10 +1983,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, (unsigned long long)block->generation, (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); + mapped_datav[0])->generation)); /* it would not be safe to go on */ btrfsic_dump_tree(state); - return; + goto continue_loop; } /* @@ -1796,18 +2014,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, + ret = btrfsic_map_superblock(state, bytenr, + processed_len, bdev, &block_ctx); else - ret = btrfsic_map_block(state, bytenr, len, + ret = btrfsic_map_block(state, bytenr, processed_len, &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)bytenr); - return; + goto continue_loop; } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1863,11 +2082,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->logical_bytenr = bytenr; block->is_metadata = 1; if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); ret = btrfsic_process_written_superblock( state, block, (struct btrfs_super_block *) - mapped_data); + mapped_datav[0]); if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { printk(KERN_INFO @@ -1880,8 +2101,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, state, block, &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); } if (ret) @@ -1912,26 +2131,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 bytenr; if (!is_metadata) { + processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block (%s/%llu/?)" " !found in hash table, D.\n", dev_state->name, (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; + block_ctx.len = processed_len; + block_ctx.mem_to_free = NULL; + block_ctx.pagev = NULL; } else { + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/?)" @@ -1940,17 +2163,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, dev_state->name, (unsigned long long)dev_bytenr); - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)dev_bytenr); - return; + goto continue_loop; } } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1960,7 +2183,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (NULL == block) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&block_ctx); - return; + goto continue_loop; } block->dev_state = dev_state; block->dev_bytenr = dev_bytenr; @@ -2020,9 +2243,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (is_metadata) { ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); + &block_ctx, 0, 0); if (ret) printk(KERN_INFO "btrfsic: process_metablock(root @%llu)" @@ -2031,6 +2252,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } btrfsic_release_block_ctx(&block_ctx); } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; } static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) @@ -2213,7 +2441,7 @@ static int btrfsic_process_written_superblock( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -2224,7 +2452,8 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic_process_written_superblock(" "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, &tmp_next_block_ctx, mirror_num); if (ret) { @@ -2689,7 +2918,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) + u64 dev_bytenr) { int num_copies; int mirror_num; @@ -2698,10 +2927,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, int match = 0; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); + bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, state->metablock_size, &block_ctx, mirror_num); if (ret) { printk(KERN_INFO "btrfsic:" @@ -2727,7 +2956,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, (unsigned long long)bytenr, dev_state->name, (unsigned long long)dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, &block_ctx, mirror_num); if (ret) continue; @@ -2781,13 +3011,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) (unsigned long)bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, + &bh->b_data, 1, NULL, NULL, bh, rw); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", rw, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2836,6 +3066,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) unsigned int i; u64 dev_bytenr; int bio_is_patched; + char **mapped_datav; dev_bytenr = 512 * bio->bi_sector; bio_is_patched = 0; @@ -2848,35 +3079,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)dev_bytenr, bio->bi_bdev); + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, + GFP_NOFS); + if (!mapped_datav) + goto leave; for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE) == (dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", + "#%u: page=%p, len=%u, offset=%u\n", i, bio->bi_io_vec[i].bv_page, - mapped_data, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); + } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; } + kfree(mapped_datav); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", rw, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2903,6 +3145,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) bio->bi_end_io = btrfsic_bio_end_io; } } +leave: mutex_unlock(&btrfsic_mutex); submit_bio(rw, bio); @@ -2917,6 +3160,30 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; + if (root->nodesize != root->leafsize) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", + root->nodesize, root->leafsize); + return -1; + } + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } state = kzalloc(sizeof(*state), GFP_NOFS); if (NULL == state) { printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); @@ -2933,6 +3200,8 @@ int btrfsic_mount(struct btrfs_root *root, state->print_mask = print_mask; state->include_extent_data = including_extent_data; state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; INIT_LIST_HEAD(&state->all_blocks_list); btrfsic_block_hashtable_init(&state->block_hashtable); btrfsic_block_link_hashtable_init(&state->block_link_hashtable); -- cgit v1.2.3 From a25c75d5ad04df0a7abd09585231b4021a91a358 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Apr 2012 09:59:29 +0300 Subject: Btrfs: cleanup: use consistent lock naming It confuses Smatch that we use two names for the same lock. Plus the shorter name is nicer. This doesn't change how the code works, it's just a cleanup. Signed-off-by: Dan Carpenter --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49fd7b66d57..59ae191d4f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3578,7 +3578,7 @@ again: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } -- cgit v1.2.3 From cd1b413c5c863a96bfdeab8e91b1fb3a52665e42 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 22 May 2012 14:56:50 +0200 Subject: Btrfs: ulist realloc bugfix ulist_next gets the pointer to the previously returned element to find the next element from there. However, when we call ulist_add while iteration with ulist_next is in progress (ulist explicitly supports this), we can realloc the ulist internal memory, which makes the pointer to the previous element useless. Instead, we now use an iterator parameter that's independent from the internal pointers. Reported-by: Alexander Block Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 18 +++++++++++++----- fs/btrfs/ulist.c | 23 ++++++++--------------- fs/btrfs/ulist.h | 9 ++++++++- 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bcec0675023..b41d94a6471 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -201,6 +201,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct __prelim_ref *new_ref; struct ulist *parents; struct ulist_node *node; + struct ulist_iterator uiter; parents = ulist_alloc(GFP_NOFS); if (!parents) @@ -225,11 +226,12 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { + while ((node = ulist_next(parents, &uiter))) { new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); if (!new_ref) { ret = -ENOMEM; @@ -788,6 +790,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, { struct ulist *tmp; struct ulist_node *node = NULL; + struct ulist_iterator uiter; int ret; tmp = ulist_alloc(GFP_NOFS); @@ -799,6 +802,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return -ENOMEM; } + ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, seq, tmp, *roots); @@ -807,7 +811,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ulist_free(*roots); return ret; } - node = ulist_next(tmp, node); + node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; @@ -1176,6 +1180,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; struct seq_list seq_elem; + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", @@ -1201,12 +1207,14 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, if (ret) goto out; - while (!ret && (ref_node = ulist_next(refs, ref_node))) { + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, seq_elem.seq, &roots); if (ret) break; - while (!ret && (root_node = ulist_next(roots, root_node))) { + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu\n", root_node->val, ref_node->val); ret = iterate_leaf_refs(fs_info, ref_node->val, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 12f5147bd2b..17e68bdc307 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -23,9 +23,9 @@ * * ulist = ulist_alloc(); * ulist_add(ulist, root); - * elem = NULL; + * ULIST_ITER_INIT(&uiter); * - * while ((elem = ulist_next(ulist, elem)) { + * while ((elem = ulist_next(ulist, &uiter)) { * for (all child nodes n in elem) * ulist_add(ulist, n); * do something useful with the node; @@ -188,33 +188,26 @@ EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * * Note: locking must be provided by the caller. In case of rwlocks only read * locking is needed * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the * end is reached. No guarantee is made with respect to the order in which * the elements are returned. They might neither be returned in order of * addition nor in ascending order. * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - int next; - if (ulist->nnodes == 0) return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) + if (uiter->i < 0 || uiter->i >= ulist->nnodes) return NULL; - return &ulist->nodes[next]; + return &ulist->nodes[uiter->i++]; } EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2e25dec58ec..62d2574f775 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -24,6 +24,10 @@ */ #define ULIST_SIZE 16 +struct ulist_iterator { + int i; +}; + /* * element of the list */ @@ -63,6 +67,9 @@ struct ulist *ulist_alloc(unsigned long gfp_mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, unsigned long gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); +struct ulist_node *ulist_next(struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) #endif -- cgit v1.2.3 From dadcaf78b51e239d93f5ec9aac736de99081ee74 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 22 May 2012 13:43:25 +0200 Subject: Btrfs: bugfix in btrfs_find_parent_nodes That one has been around since the addition of backref.c. Due to the way we calculate our slot numbers, after adding inline refs we're missing one keyed ref unless it's located at the beginning of a new leaf. Reported-by: Alexander Block Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b41d94a6471..c69a846999b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -413,7 +413,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, * enumerate all inline refs */ leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; item_size = btrfs_item_size_nr(leaf, slot); BUG_ON(item_size < sizeof(*ei)); @@ -661,8 +661,9 @@ again: struct extent_buffer *leaf; int slot; + path->slots[0]--; leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY) { -- cgit v1.2.3 From d5c88b735fdf2ef796bb937396dd58dac84e8407 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 15 May 2012 17:55:51 +0200 Subject: Btrfs: bugfix: ignore the wrong key for indirect tree block backrefs The key we store with a tree block backref is only a hint. It is set when the ref is created and can remain correct for a long time. As the tree is rebalanced, however, eventually the key no longer points to the correct destination. With this patch, we change find_parent_nodes to no longer add keys unless it knows for sure they're correct (e.g. because they're for an extent data backref). Then when we later encounter a backref ref with no parent and no key set, we grab the block and take the first key from the block itself. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 185 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 50 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c69a846999b..366978c5cdd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -30,16 +30,55 @@ struct __prelim_ref { struct list_head list; u64 root_id; - struct btrfs_key key; + struct btrfs_key key_for_search; int level; int count; u64 parent; u64 wanted_disk_byte; }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count) { struct __prelim_ref *ref; @@ -50,9 +89,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, ref->root_id = root_id; if (key) - ref->key = *key; + ref->key_for_search = *key; else - memset(&ref->key, 0, sizeof(ref->key)); + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); ref->level = level; ref->count = count; @@ -152,12 +191,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); + (unsigned long long)ref->key_for_search.objectid, + ref->key_for_search.type, + (unsigned long long)ref->key_for_search.offset); if (ret < 0) goto out; @@ -248,10 +288,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, return ret; } +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + fs_info->tree_root->leafsize, 0); + BUG_ON(!eb); + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + /* * merge two lists of backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). * mode = 2: merge identical parents */ static int __merge_refs(struct list_head *head, int mode) @@ -265,20 +360,21 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = list_entry(pos1, struct __prelim_ref, list); - if (mode == 1 && ref1->key.type == 0) - continue; for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; + struct __prelim_ref *xchg; ref2 = list_entry(pos2, struct __prelim_ref, list); if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) + if (!ref_for_same_block(ref1, ref2)) continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) @@ -298,16 +394,17 @@ static int __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, struct list_head *prefs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; int sgn; int ret = 0; if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); while ((n = rb_prev(n))) { struct btrfs_delayed_ref_node *node; @@ -339,7 +436,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, &op_key, ref->level + 1, 0, node->bytenr, node->ref_mod * sgn); break; @@ -348,7 +445,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn); @@ -356,8 +453,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - ref = btrfs_delayed_node_to_data_ref(node); key.objectid = ref->objectid; @@ -370,7 +465,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; ref = btrfs_delayed_node_to_data_ref(node); @@ -396,8 +490,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) + int *info_level, struct list_head *prefs) { int ret = 0; int slot; @@ -426,12 +519,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } else { @@ -449,7 +539,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, *info_level + 1, offset, bytenr, 1); break; @@ -464,8 +554,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -479,8 +570,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); break; } default: @@ -498,8 +589,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, */ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) + int info_level, struct list_head *prefs) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -529,7 +619,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, info_level + 1, key.offset, bytenr, 1); break; @@ -545,8 +635,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -562,7 +653,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count); break; } default: @@ -588,7 +679,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, { struct btrfs_key key; struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; struct btrfs_delayed_ref_root *delayed_refs = NULL; struct btrfs_delayed_ref_head *head; int info_level = 0; @@ -647,8 +737,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, seq, &info_key, - &prefs_delayed); + ret = __add_delayed_refs(head, seq, &prefs_delayed); if (ret) { spin_unlock(&delayed_refs->lock); goto out; @@ -668,10 +757,10 @@ again: if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); + &info_level, &prefs); if (ret) goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + ret = __add_keyed_refs(fs_info, path, bytenr, info_level, &prefs); if (ret) goto out; @@ -679,16 +768,12 @@ again: } btrfs_release_path(path); - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } list_splice_init(&prefs_delayed, &prefs); + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + ret = __merge_refs(&prefs, 1); if (ret) goto out; -- cgit v1.2.3 From 976b1908d97bd8cbd024ba7aafaa3fb637ea8e13 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 17 May 2012 16:43:03 +0200 Subject: Btrfs: look into the extent during find_all_leafs Before this patch we called find_all_leafs for a data extent, then called find_all_roots and then looked into the extent to grab the information we were seeking. This was done without holding the leaves locked to avoid deadlocks. However, this can obviouly race with concurrent tree modifications. Instead, we now look into the extent while we're holding the lock during find_all_leafs and store this information together with the leaf list. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 240 +++++++++++++++++++++++++++++++++++------------------ fs/btrfs/backref.h | 2 +- 2 files changed, 158 insertions(+), 84 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 366978c5cdd..fd13101aafa 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -24,6 +24,79 @@ #include "delayed-ref.h" #include "locking.h" +struct extent_inode_elem { + u64 inum; + u64 offset; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 data_offset; + u64 data_len; + struct extent_inode_elem *e; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + (extent_item_pos - data_offset); + *eie = e; + + return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + /* * this structure records all encountered refs on the way up to the root */ @@ -103,15 +176,16 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) + struct ulist *parents, int level, + struct btrfs_key *key, u64 wanted_disk_byte, + const u64 *extent_item_pos) { int ret; int slot; + struct extent_buffer *eb = path->nodes[level]; struct btrfs_file_extent_item *fi; - struct btrfs_key key; u64 disk_byte; + u64 wanted_objectid = key->objectid; add_parent: ret = ulist_add(parents, eb->start, 0, GFP_NOFS); @@ -136,9 +210,9 @@ add_parent: eb = path->nodes[0]; for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) + btrfs_item_key_to_cpu(eb, key, slot); + if (key->objectid != wanted_objectid || + key->type != BTRFS_EXTENT_DATA_KEY) return 0; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -158,7 +232,8 @@ add_parent: static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int search_commit_root, struct __prelim_ref *ref, - struct ulist *parents) + struct ulist *parents, + const u64 *extent_item_pos) { struct btrfs_path *path; struct btrfs_root *root; @@ -219,9 +294,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, btrfs_item_key_to_cpu(eb, &key, path->slots[0]); } - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); + ret = add_all_parents(root, path, parents, level, &key, + ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); return ret; @@ -232,7 +306,8 @@ out: */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, int search_commit_root, - struct list_head *head) + struct list_head *head, + const u64 *extent_item_pos) { int err; int ret = 0; @@ -258,7 +333,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, search_commit_root, - ref, parents); + ref, parents, extent_item_pos); if (err) { if (ret == 0) ret = err; @@ -675,7 +750,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) + u64 seq, struct ulist *refs, struct ulist *roots, + const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; @@ -778,7 +854,8 @@ again: if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); + ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs, + extent_item_pos); if (ret) goto out; @@ -797,7 +874,21 @@ again: BUG_ON(ret < 0); } if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + struct extent_inode_elem *eie = NULL; + if (extent_item_pos) { + u32 bsz; + struct extent_buffer *eb; + bsz = btrfs_level_size(fs_info->extent_root, + info_level); + eb = read_tree_block(fs_info->extent_root, + ref->parent, bsz, 0); + BUG_ON(!eb); + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + free_extent_buffer(eb); + } + ret = ulist_add(refs, ref->parent, + (unsigned long)eie, GFP_NOFS); BUG_ON(ret < 0); } kfree(ref); @@ -822,6 +913,28 @@ out: return ret; } +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct extent_inode_elem *eie_next; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)node->aux; + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } + node->aux = 0; + } + + ulist_free(blocks); +} + /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -832,7 +945,8 @@ out: */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) + u64 seq, struct ulist **leafs, + const u64 *extent_item_pos) { struct ulist *tmp; int ret; @@ -846,11 +960,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp, + extent_item_pos); ulist_free(tmp); if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); + free_leaf_list(*leafs); return ret; } @@ -872,7 +987,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) + u64 seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -891,7 +1006,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); + tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1183,67 +1298,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, iterate_extent_inodes_t *iterate, void *ctx) { - u64 disk_byte; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *eb; - int slot; - int nritems; + struct extent_inode_elem *eie; int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; - - eb = read_tree_block(fs_info->tree_root, logical, - fs_info->tree_root->leafsize, 0); - if (!eb) - return -EIO; - - /* - * from the shared data ref, we only have the leaf but we need - * the key. thus, we must look into all items and see that we - * find one (some) with a reference to our extent item. - */ - nritems = btrfs_header_nritems(eb); - for (slot = 0; slot < nritems; ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; + for (eie = inode_list; eie; eie = eie->next) { pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); break; } } - free_extent_buffer(eb); - return ret; } @@ -1287,30 +1360,31 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); - + seq_elem.seq, &refs, &extent_item_pos); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); + pr_debug("root %llu references leaf %llu, data list " + "%#lx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs( + (struct extent_inode_elem *)ref_node->aux, + root_node->val, extent_item_objectid, + iterate, ctx); } + ulist_free(roots); + roots = NULL; } - ulist_free(refs); + free_leaf_list(refs); ulist_free(roots); out: if (!search_commit_root) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 57ea2e959e4..94ba1b2e733 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); + u64 seq, struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, -- cgit v1.2.3 From 5581a51a59a1f5f51ac3d4bacafb738d35e0350b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 17:04:52 +0200 Subject: Btrfs: don't set for_cow parameter for tree block functions Three callers of btrfs_free_tree_block or btrfs_alloc_tree_block passed parameter for_cow = 1. In fact, these two functions should never mark their tree modification operations as for_cow, because they can change the number of blocks referenced by a tree. Hence, we remove the extra for_cow parameter from these functions and make them pass a zero down. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 22 +++++++++++----------- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 10 +++++----- fs/btrfs/ioctl.c | 2 +- 5 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264fbc6..56485b3b7c3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -255,7 +255,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0, 1); + buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -467,7 +467,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -509,7 +509,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -525,7 +525,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -987,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -1042,7 +1042,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); + btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); right = NULL; } else { @@ -1084,7 +1084,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -2129,7 +2129,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); + level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2252,7 +2252,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -3004,7 +3004,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3804,7 +3804,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); extent_buffer_get(leaf); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + btrfs_free_tree_block(trans, root, leaf, 0, 1); free_extent_buffer_stale(leaf); } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ec42a24e935..e863188a3f1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2496,11 +2496,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); + u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ffc88a7db..f4330749251 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); + 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49fd7b66d57..b68eb7ad05a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5217,7 +5217,7 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; int ret; @@ -5227,7 +5227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, 0); BUG_ON(ret); /* -ENOMEM */ } @@ -6249,7 +6249,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6297,7 +6297,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); + extent_op, 0); BUG_ON(ret); /* -ENOMEM */ } return buf; @@ -6715,7 +6715,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14f8e1faa46..7f3a91367d7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; -- cgit v1.2.3 From 64947ec0d16dd20d6542b58cf82c8d5f9678cabf Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 16:57:09 +0200 Subject: Btrfs: move struct seq_list to ctree.h Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/delayed-ref.h | 5 ----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e863188a3f1..e0da6dbeb57 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3098,4 +3098,11 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; + u32 flags; +}; + #endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d8f244d9492..fd824467021 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -struct seq_list { - struct list_head list; - u64 seq; -}; - static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) { assert_spin_locked(&delayed_refs->lock); -- cgit v1.2.3 From 815a51c74ad14864d0a8fff5eea983819c18feae Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 17:00:02 +0200 Subject: Btrfs: dummy extent buffers for tree mod log The tree modification log needs two ways to create dummy extent buffers, once by allocating a fresh one (to rebuild an old root) and once by cloning an existing one (to make private rewind modifications) to it. Signed-off-by: Jan Schmidt --- fs/btrfs/extent_io.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/extent_io.h | 3 ++ 2 files changed, 76 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2fb52c26c67..3daed70a401 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3930,6 +3930,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, eb->start = start; eb->len = len; eb->tree = tree; + eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3967,6 +3968,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return eb; } +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_ATOMIC); + BUG_ON(!p); + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ + struct extent_buffer *eb; + unsigned long num_pages = num_extent_pages(0, len); + unsigned long i; + + eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_ATOMIC); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (i--; i > 0; i--) + __free_page(eb->pages[i]); + __free_extent_buffer(eb); + return NULL; +} + static int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || @@ -3982,6 +4037,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, { unsigned long index; struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); @@ -3992,7 +4048,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) { + if (page && mapped) { spin_lock(&page->mapping->private_lock); /* * We do this since we'll remove the pages after we've @@ -4017,6 +4073,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, } spin_unlock(&page->mapping->private_lock); + } + if (page) { /* One for when we alloced the page */ page_cache_release(page); } @@ -4235,14 +4293,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - struct extent_io_tree *tree = eb->tree; + if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + } else { + struct extent_io_tree *tree = eb->tree; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, - eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + } /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb, 0); @@ -4259,6 +4321,10 @@ void free_extent_buffer(struct extent_buffer *eb) return; spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b516c3b8dec..96434a61d7c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,6 +39,7 @@ #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -265,6 +266,8 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); -- cgit v1.2.3 From f29021b29a85701c08afadfd51d87163fb078059 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 17:55:38 +0200 Subject: Btrfs: add tree mod log to fs_info Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/disk-io.c | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e0da6dbeb57..01639e10004 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1129,6 +1129,15 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + /* this protects tree_mod_seq_list */ + spinlock_t tree_mod_seq_lock; + atomic_t tree_mod_seq; + struct list_head tree_mod_seq_list; + + /* this protects tree_mod_log */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f4330749251..f51ad8477f1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv); btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); @@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; + fs_info->tree_mod_log = RB_ROOT; /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); -- cgit v1.2.3 From b0b0382bb4904965a9e9fca77ad87514dfda0d1c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Apr 2012 14:34:06 -0400 Subject: ->encode_fh() API change pass inode + parent's inode or NULL instead of dentry + bool saying whether we want the parent or not. NOTE: that needs ceph fix folded in. Signed-off-by: Al Viro --- fs/btrfs/export.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e887ee62b6d..614f34a899c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -13,15 +13,14 @@ parent_root_objectid) / 4) #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) -static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, - int connectable) +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { struct btrfs_fid *fid = (struct btrfs_fid *)fh; - struct inode *inode = dentry->d_inode; int len = *max_len; int type; - if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { @@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; + if (parent) { u64 parent_root_id; - spin_lock(&dentry->d_lock); - - parent = dentry->d_parent->d_inode; fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; parent_root_id = BTRFS_I(parent)->root->objectid; - spin_unlock(&dentry->d_lock); - if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; -- cgit v1.2.3 From 528c032764f4d3c6cb5f5ece090d9d5882655982 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 11:03:55 -0400 Subject: btrfs: trivial endianness annotations Signed-off-by: Al Viro --- fs/btrfs/free-space-cache.c | 7 ++++--- fs/btrfs/ulist.c | 4 ++-- fs/btrfs/ulist.h | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 202008ec367..eb453506fac 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return ERR_PTR(-ENOENT); } - inode->i_mapping->flags &= ~__GFP_FS; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); return inode; } @@ -365,7 +366,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) { - u64 *val; + __le64 *val; io_ctl_map_page(io_ctl, 1); @@ -388,7 +389,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) { - u64 *gen; + __le64 *gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 12f5147bd2b..ad993bc2df9 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit); * * The allocated ulist will be returned in an initialized state. */ -struct ulist *ulist_alloc(unsigned long gfp_mask) +struct ulist *ulist_alloc(gfp_t gfp_mask) { struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); @@ -144,7 +144,7 @@ EXPORT_SYMBOL(ulist_free); * unaltered. */ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) + gfp_t gfp_mask) { int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2e25dec58ec..ad85b0e60af 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -59,10 +59,10 @@ struct ulist { void ulist_init(struct ulist *ulist); void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); +struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); + gfp_t gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); #endif -- cgit v1.2.3 From bd989ba359f2acb8bc5f5490e19010fc0a6f8356 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 17:18:50 +0200 Subject: Btrfs: add tree modification log functions The tree mod log will log modifications made fs-tree nodes. Most modifications are done by autobalance of the tree. Such changes are recorded as long as a block entry exists. When released, the log is cleaned. With the tree modification log, it's possible to reconstruct a consistent old state of the tree. This is required to do backref walking on a busy file system. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 408 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/ctree.h | 5 + 2 files changed, 412 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 56485b3b7c3..72b9f97e2fd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -288,6 +289,412 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + struct seq_list elem; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + elem->flags = 1; + spin_lock(&fs_info->tree_mod_seq_lock); + __get_tree_mod_seq(fs_info, elem); + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + BUG_ON(!(elem->flags & 1)); + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + goto out; + } + min_seq = cur_elem->seq; + } + } + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->elem.seq > min_seq) + continue; + rb_erase(node, tm_root); + list_del(&tm->elem.list); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +out: + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + int ret = 0; + + BUG_ON(!tm || !tm->elem.seq); + + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->elem.seq < tm->elem.seq) + new = &((*new)->rb_left); + else if (cur->elem.seq > tm->elem.seq) + new = &((*new)->rb_right); + else { + kfree(tm); + ret = -EEXIST; + goto unlock; + } + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); +unlock: + write_unlock(&fs_info->tree_mod_log_lock); + return ret; +} + +int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, + struct tree_mod_elem **tm_ret) +{ + struct tree_mod_elem *tm; + u64 seq = 0; + + smp_mb(); + if (list_empty(&fs_info->tree_mod_seq_list)) + return 0; + + tm = *tm_ret = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; + + __get_tree_mod_seq(fs_info, &tm->elem); + seq = tm->elem.seq; + tm->elem.flags = 0; + + return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + int slot, enum mod_log_op op) +{ + return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + int i; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING); + BUG_ON(ret < 0); + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + return __tree_mod_log_insert(fs_info, tm); +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->elem.seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->elem.seq > cur->elem.seq); + found = cur; + node = node->rb_left; + } else if (cur->elem.seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->elem.seq < cur->elem.seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret; + int i; + + smp_mb(); + if (list_empty(&fs_info->tree_mod_seq_list)) + return; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return; + + /* speed this up by single seq for all operations? */ + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + int i; + int ret; + u32 nritems; + + smp_mb(); + if (list_empty(&fs_info->tree_mod_seq_list)) + return; + + if (btrfs_header_level(eb) == 0) + return; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node) +{ + int ret; + tree_mod_log_free_eb(root->fs_info, root->node); + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS); + BUG_ON(ret < 0); +} + /* * check if the tree block can be shared by multiple trees */ @@ -2271,7 +2678,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 01639e10004..e53bfb9b391 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3114,4 +3114,9 @@ struct seq_list { u32 flags; }; +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); + #endif -- cgit v1.2.3 From f230475e62f77930e776881deb6e95cfd2226bd4 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sat, 26 May 2012 11:43:17 +0200 Subject: Btrfs: put all block modifications into the tree mod log When running functions that can make changes to the internal trees (e.g. btrfs_search_slot), we check if somebody may be interested in the block we're currently modifying. If so, we record our modification to be able to rewind it later on. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 72b9f97e2fd..07c1a96aa4a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,6 +39,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid, + u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 time_seq); struct btrfs_path *btrfs_alloc_path(void) { @@ -816,6 +824,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } + /* + * don't log freeing in case we're freeing the root node, this + * is done by tree_mod_log_set_root_pointer later + */ + if (buf != root->node && btrfs_header_level(buf) != 0) + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -913,6 +927,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); + tree_mod_log_set_root_pointer(root, cow); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -926,6 +941,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1381,6 +1398,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1455,6 +1473,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &right_key, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1498,6 +1518,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1595,6 +1617,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1646,6 +1670,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2348,6 +2374,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; + tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -2430,12 +2457,16 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { + tree_mod_log_eb_move(root->fs_info, src, 0, push_items, + src_nritems - push_items); memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -2489,11 +2520,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); + tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2568,6 +2602,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; + tree_mod_log_set_root_pointer(root, c); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2678,6 +2713,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- cgit v1.2.3 From f3ea38da3e76455fbd6d405cdca4d050ed085458 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sat, 26 May 2012 11:45:21 +0200 Subject: Btrfs: add del_ptr and insert_ptr modifications to the tree mod log Record all relevant modifications to block pointers in the tree mod log so that we can rewind them later on for backref walking. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 07c1a96aa4a..69ce3f7deee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,7 +38,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); + struct btrfs_path *path, int level, int slot, + int tree_mod_log); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -1465,7 +1466,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); + del_ptr(trans, root, path, level + 1, pslot + 1, 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1509,7 +1510,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); + del_ptr(trans, root, path, level + 1, pslot, 1); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -2626,10 +2627,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, static void insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) + int slot, int level, int tree_mod_log) { struct extent_buffer *lower; int nritems; + int ret; BUG_ON(!path->nodes[level]); btrfs_assert_tree_locked(path->nodes[level]); @@ -2638,11 +2640,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); @@ -2726,7 +2736,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(split); insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + path->slots[level + 1] + 1, level + 1, 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -3263,7 +3273,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3470,7 +3480,7 @@ again: if (mid <= slot) { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3479,7 +3489,7 @@ again: } else { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); + path->slots[1], 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -4191,19 +4201,31 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) + struct btrfs_path *path, int level, int slot, + int tree_mod_log) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; + int ret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } + + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } + nritems--; btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { @@ -4235,7 +4257,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); + del_ptr(trans, root, path, 1, path->slots[1], 1); /* * btrfs_free_extent is expensive, we want to make sure we -- cgit v1.2.3 From 5d9e75c41d11ca79b1d1ff6ed17c88c9047d1fea Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 18:25:47 +0200 Subject: Btrfs: add btrfs_search_old_slot The tree modification log together with the current state of the tree gives a consistent, old version of the tree. btrfs_search_old_slot is used to search through this old version and return old (dummy!) extent buffers. Naturally, this function cannot do any tree modifications. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 319 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/ctree.h | 2 + 2 files changed, 317 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 69ce3f7deee..0954f1770fd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -960,6 +960,207 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = root->node->start; + int looped = 0; + + if (!time_seq) + return 0; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return 0; + /* + * we must have key remove operations in the log before the + * replace operation. + */ + BUG_ON(!tm); + + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + BUG_ON(root_logical == root->node->start); + looped = 1; + } + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + while (tm && tm->elem.seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + if (tm->slot != n - 1) { + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->slot + 1); + memmove_extent_buffer(eb, o_dst, o_src, p_size); + } + n--; + break; + case MOD_LOG_MOVE_KEYS: + memmove_extent_buffer(eb, tm->slot, tm->move.dst_slot, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(eb->start, + fs_info->tree_root->nodesize); + BUG_ON(!eb_rewin); + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + BUG_ON(!eb_rewin); + } + + extent_buffer_get(eb_rewin); + free_extent_buffer(eb); + + __tree_mod_log_rewind(eb_rewin, time_seq, tm); + + return eb_rewin; +} + +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb; + struct tree_mod_root *old_root; + u64 old_generation; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (!tm) + return root->node; + + old_root = &tm->old_root; + old_generation = tm->generation; + + tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + /* + * there was an item in the log when __tree_mod_log_oldest_root + * returned. this one must not go away, because the time_seq passed to + * us must be blocking its removal. + */ + BUG_ON(!tm); + + if (old_root->logical == root->node->start) { + /* there are logged operations for the current root */ + eb = btrfs_clone_extent_buffer(root->node); + } else { + /* there's a root replace operation for the current root */ + eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, + root->nodesize); + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, root->root_key.objectid); + } + if (!eb) + return NULL; + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + __tree_mod_log_rewind(eb, time_seq, tm); + + return eb; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -1930,7 +2131,7 @@ static int read_block_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) + struct btrfs_key *key, u64 time_seq) { u64 blocknr; u64 gen; @@ -2284,7 +2485,7 @@ cow_done: } err = read_block_for_search(trans, root, p, - &b, level, slot, key); + &b, level, slot, key, 0); if (err == -EAGAIN) goto again; if (err) { @@ -2355,6 +2556,116 @@ done: return ret; } +/* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + level = 0; + b = get_old_root(root, time_seq); + extent_buffer_get(b); + level = btrfs_header_level(b); + btrfs_tree_read_lock(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + b = tree_mod_log_rewind(root->fs_info, b, time_seq); + if (b != p->nodes[level]) { + btrfs_tree_unlock_rw(p->nodes[level], + p->locks[level]); + p->locks[level] = 0; + p->nodes[level] = b; + } + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -4735,7 +5046,7 @@ again: next = c; next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key); + slot, &key, 0); if (ret == -EAGAIN) goto again; @@ -4772,7 +5083,7 @@ again: break; ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key); + 0, &key, 0); if (ret == -EAGAIN) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e53bfb9b391..6ba21b12cec 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2668,6 +2668,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, -- cgit v1.2.3 From 8445f61cad927b6efffdd4e042a51a783ff8853f Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 16 May 2012 18:36:03 +0200 Subject: Btrfs: use the tree modification log for backref resolving This enables backref resolving on life trees while they are changing. This is a prerequisite for quota groups and just nice to have for everything else. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 43 +++++++++++++++++++++++++++---------------- fs/btrfs/backref.h | 3 ++- 2 files changed, 29 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index fd13101aafa..0ac47f2834d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -231,6 +231,7 @@ add_parent: */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int search_commit_root, + u64 time_seq, struct __prelim_ref *ref, struct ulist *parents, const u64 *extent_item_pos) @@ -266,7 +267,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", (unsigned long long)ref->root_id, level, ref->count, ret, @@ -305,7 +306,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, + int search_commit_root, u64 time_seq, struct list_head *head, const u64 *extent_item_pos) { @@ -333,7 +334,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, search_commit_root, - ref, parents, extent_item_pos); + time_seq, ref, parents, + extent_item_pos); if (err) { if (ret == 0) ret = err; @@ -750,7 +752,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots, + u64 delayed_ref_seq, u64 time_seq, + struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos) { struct btrfs_key key; @@ -813,7 +816,8 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, seq, &prefs_delayed); + ret = __add_delayed_refs(head, delayed_ref_seq, + &prefs_delayed); if (ret) { spin_unlock(&delayed_refs->lock); goto out; @@ -854,8 +858,8 @@ again: if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs, - extent_item_pos); + ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, + &prefs, extent_item_pos); if (ret) goto out; @@ -945,7 +949,8 @@ static void free_leaf_list(struct ulist *blocks) */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist **leafs, + u64 delayed_ref_seq, u64 time_seq, + struct ulist **leafs, const u64 *extent_item_pos) { struct ulist *tmp; @@ -960,8 +965,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp, - extent_item_pos); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); if (ret < 0 && ret != -ENOENT) { @@ -987,7 +992,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist **roots) + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1005,8 +1011,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots, NULL); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1338,7 +1344,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem; + struct seq_list seq_elem = {}; + struct seq_list tree_mod_seq_elem = {}; struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1357,17 +1364,20 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); btrfs_get_delayed_seq(delayed_refs, &seq_elem); spin_unlock(&delayed_refs->lock); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, &extent_item_pos); + seq_elem.seq, tree_mod_seq_elem.seq, &refs, + &extent_item_pos); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - seq_elem.seq, &roots); + seq_elem.seq, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1388,6 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ulist_free(roots); out: if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 94ba1b2e733..c18d8ac7b79 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist **roots); + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, -- cgit v1.2.3 From 19ae4e8133f370d820c4cdd61a4b703235664a5f Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 May 2012 15:42:19 +0200 Subject: Btrfs: fs_info variable for join_transaction Signed-off-by: Jan Schmidt --- fs/btrfs/transaction.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 36422254ef6..eb2bd826bb0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,48 +55,49 @@ static noinline void switch_commit_root(struct btrfs_root *root) static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *fs_info = root->fs_info; - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&fs_info->trans_lock); return -EROFS; } - if (root->fs_info->trans_no_join) { + if (fs_info->trans_no_join) { if (!nofail) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return -EBUSY; } } - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; - spin_lock(&root->fs_info->trans_lock); - if (root->fs_info->running_transaction) { + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the trans_no_join checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; goto loop; } @@ -127,14 +128,14 @@ loop: INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping); - root->fs_info->generation++; - cur_trans->transid = root->fs_info->generation; - root->fs_info->running_transaction = cur_trans; + fs_info->btree_inode->i_mapping); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } -- cgit v1.2.3 From 20b297d620cd1bb94127942bbb3702fb7b1030b2 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 May 2012 15:43:53 +0200 Subject: Btrfs: tree mod log sanity checks in join_transaction When a fresh transaction begins, the tree mod log must be clean. Users of the tree modification log must ensure they never span across transaction boundaries. We reset the sequence to 0 in this safe situation to make absolutely sure overflow can't happen. Signed-off-by: Jan Schmidt --- fs/btrfs/transaction.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index eb2bd826bb0..667735fb45e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -122,6 +122,24 @@ loop: cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; cur_trans->delayed_refs.seq = 1; + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { + printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + atomic_set(&fs_info->tree_mod_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); -- cgit v1.2.3 From 0c4d2d95d06e920e0c61707e62c7fffc9c57f63a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Apr 2012 15:03:02 -0400 Subject: Btrfs: use i_version instead of our own sequence We've been keeping around the inode sequence number in hopes that somebody would use it, but nobody uses it and people actually use i_version which serves the same purpose, so use i_version where we used the incore inode's sequence number and that way the sequence is updated properly across the board, and not just in file write. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 --- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/file.c | 1 - fs/btrfs/inode.c | 16 +++++++++++++--- fs/btrfs/ioctl.c | 2 ++ fs/btrfs/super.c | 2 +- fs/btrfs/xattr.c | 1 + 7 files changed, 19 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9b9b15fd520..3771b8543a7 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -83,9 +83,6 @@ struct btrfs_inode { */ u64 generation; - /* sequence number for NFS changes */ - u64 sequence; - /* * transid of the trans_handle that last modified this inode */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 03e3748d84d..bcd40c7109f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53bf2d764bb..8aa8d7fe74d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1409,7 +1409,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); goto out; } - BTRFS_I(inode)->sequence++; start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b16c641ce..41a62e6954c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2510,7 +2510,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2594,7 +2594,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_sequence(leaf, item, inode->i_version); btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); @@ -2752,6 +2752,8 @@ err: goto out; btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(inode); + inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); out: @@ -3089,6 +3091,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); if (ret) @@ -3638,6 +3641,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid) { setattr_copy(inode, attr); + inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) @@ -4730,6 +4734,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); + inode_inc_iversion(parent_inode); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, parent_inode); if (ret) @@ -4937,6 +4942,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_inc_nlink(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -6894,7 +6900,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->root = NULL; ei->space_info = NULL; ei->generation = 0; - ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; @@ -7193,6 +7198,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) btrfs_add_ordered_operation(trans, root, old_inode); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -7219,6 +7227,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_inode) { + inode_inc_iversion(new_inode); new_inode->i_ctime = CURRENT_TIME; if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -7490,6 +7499,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14f8e1faa46..fccde7402cf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } btrfs_update_iflags(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); @@ -2622,6 +2623,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5f8fca4195..bd6d143cea8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -769,7 +769,7 @@ static int btrfs_fill_super(struct super_block *sb, #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - + sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk("btrfs: open_ctree failed\n"); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e7a5659087e..3f4e2d69e83 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (ret) goto out; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); -- cgit v1.2.3 From 30f8fe3e47c5bb5715aa80b2a2fa0cab8b218fae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 13:55:30 -0400 Subject: Btrfs: cache no acl on new inodes When running compilebench I noticed we were spending some time looking up acls on new inodes, which shouldn't be happening since there were no acls. This is because when we init acls on the inode after creating them we don't cache the fact there are no acls if there aren't any. Doing this adds a little bit of a bump to my compilebench runs. Thanks, Btrfs: cache no acl on new inodes Signed-off-by: Josef Bacik --- fs/btrfs/acl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 89b156d85d6..761e2cd8fed 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); + } else { + cache_no_acl(inode); } + } else { + cache_no_acl(inode); } failed: posix_acl_release(acl); -- cgit v1.2.3 From d7dbe9e7f64e72ec6548658857c5d92327a73633 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 14:00:51 -0400 Subject: Btrfs: fix compile warnings in extent_io.c These warnings are bogus since we will always have at least one page in an eb, but to make the compiler happy just set ret = 0 in these two cases. Thanks, Btrfs: fix compile warnings in extent_io.c These warnings are bogus since we will always have at least one page in an eb, but to make the compiler happy just set ret = 0 in these two cases. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 82b4829f00b..836fc37a437 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3163,7 +3163,7 @@ static int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; int rw = (epd->sync_io ? WRITE_SYNC : WRITE); - int ret; + int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); -- cgit v1.2.3 From 551ebb2d34304ee2abfe6b00d39ec65d5e4e8266 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 14:41:09 -0400 Subject: Btrfs: remove useless waiting and extra filemap work In btrfs_wait_ordered_range we have been calling filemap_fdata_write() twice because compression does strange things and then waiting. Then we look up ordered extents and if we find any we will always schedule_timeout(); once and then loop back around and do it all again. We will even check to see if there is delalloc pages on this range and loop again. So this patch gets rid of the multipe fdata_write() calls and just does filemap_write_and_wait(). In the case of compression we will still find the ordered extents and start those individually if we need to so that is ok, but in the normal buffered case we avoid all this weird overhead. Then in the case of the schedule_timeout(1), we don't need it. All callers either 1) don't care, they just want to make sure what they just wrote maeks it to disk or 2) are doing the lock()->lookup ordered->unlock->flush thing in which case it will lock and check for ordered extents _anyway_ so get back to them as quickly as possible. The delaloc check is simply not needed, this only catches the case where we write to the file again since doing the filemap_write_and_wait() and if the caller truly cares about that it will take care of everything itself. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bbf6d0d9aeb..9807750c625 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -621,19 +621,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } -again: + /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start. - */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + filemap_write_and_wait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -657,11 +649,6 @@ again: break; end--; } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } } /* -- cgit v1.2.3 From 0885ef5b5601e9b007c383e77c172769b1f214fd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 15:09:39 -0400 Subject: Btrfs: do not do filemap_write_and_wait_range in fsync We already do the btrfs_wait_ordered_range which will do this for us, so just remove this call so we don't call it twice. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8aa8d7fe74d..cfc0ab915d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1497,14 +1497,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); - /* we wait first, since the writeback may change the inode */ + /* + * we wait first, since the writeback may change the inode, also wait + * ordered range does a filemape_write_and_wait_range which is why we + * don't do it above like other file systems. + */ root->log_batch++; - btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_wait_ordered_range(inode, start, end); root->log_batch++; /* -- cgit v1.2.3 From 0d2450abfa359ff94a2bee64a7daeba68c346c81 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 24 Apr 2012 22:59:16 +0300 Subject: btrfs: allow changing 'thread_pool' size at remount time Changing 'mount -oremount,thread_pool=2 /' didn't make any effect: maximum amount of worker threads is specified in 2 places: - in 'strict btrfs_fs_info::thread_pool_size' - in each worker struct: 'struct btrfs_workers::max_workers' 'mount -oremount' updated only 'btrfs_fs_info::thread_pool_size'. Fix it by pushing new maximum value to all created worker structures as well. Cc: Josef Bacik Cc: Chris Mason Reviewed-by: Josef Bacik Signed-off-by: Sergei Trofimovich --- fs/btrfs/super.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bd6d143cea8..2cd32175753 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -435,11 +435,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_thread_pool: intarg = 0; match_int(&args[0], &intarg); - if (intarg) { + if (intarg) info->thread_pool_size = intarg; - printk(KERN_INFO "btrfs: thread pool %d\n", - info->thread_pool_size); - } break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -1118,6 +1115,40 @@ error_fs_info: return ERR_PTR(error); } +static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) +{ + spin_lock_irq(&workers->lock); + workers->max_workers = new_limit; + spin_unlock_irq(&workers->lock); +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + int new_pool_size, int old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + old_pool_size, new_pool_size); + + btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->workers, new_pool_size); + btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1137,6 +1168,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -1180,7 +1214,8 @@ restore: fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; fs_info->alloc_start = old_alloc_start; - fs_info->thread_pool_size = old_thread_pool_size; + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; return ret; } -- cgit v1.2.3 From 2eec6c8102c62c540c637176271cfdb13d828d7b Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 26 Apr 2012 00:37:14 +0800 Subject: Fix minor type issues Address some minor type issues identified by sparse checker. Signed-off-by: Daniel J Blueman --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ulist.c | 4 ++-- fs/btrfs/ulist.h | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fccde7402cf..9ebb2c7145a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2916,7 +2916,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } - user_dest = (struct btrfs_ioctl_space_info *) + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 12f5147bd2b..ad993bc2df9 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit); * * The allocated ulist will be returned in an initialized state. */ -struct ulist *ulist_alloc(unsigned long gfp_mask) +struct ulist *ulist_alloc(gfp_t gfp_mask) { struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); @@ -144,7 +144,7 @@ EXPORT_SYMBOL(ulist_free); * unaltered. */ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) + gfp_t gfp_mask) { int i; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2e25dec58ec..6568c352773 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -59,10 +59,9 @@ struct ulist { void ulist_init(struct ulist *ulist); void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); +struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); #endif -- cgit v1.2.3 From f07c9a79f06cd33b1c9c2c4eacb60bafa7e3f310 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 26 Apr 2012 18:35:12 +0200 Subject: Btrfs: avoid buffer overrun in btrfs_printk The buffer read-overrun would be triggered by a printk format starting with , where N is a single digit. NUL-terminate after strncpy. Use memcpy, not strncpy, since we know the string we're copying fits in the destination buffer and contains no NUL byte. Signed-off-by: Jim Meyering --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2cd32175753..46b26650415 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) va_start(args, fmt); if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - strncpy(lvl, fmt, 3); + memcpy(lvl, fmt, 3); + lvl[3] = '\0'; fmt += 3; type = logtypes[fmt[1] - '0']; } else -- cgit v1.2.3 From a27202fbe92b12eec895c36644440175de01d7a6 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 26 Apr 2012 18:36:56 +0200 Subject: Btrfs: NUL-terminate path buffer in DEV_INFO ioctl result A device with name of length BTRFS_DEVICE_PATH_NAME_MAX or longer would not be NUL-terminated in the DEV_INFO ioctl result buffer. Signed-off-by: Jim Meyering --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9ebb2c7145a..3d8ab27622c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2263,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) + if (dev->name) { strncpy(di_args->path, dev->name, sizeof(di_args->path)); - else + di_args->path[sizeof(di_args->path) - 1] = 0; + } else { di_args->path[0] = '\0'; + } out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) -- cgit v1.2.3 From f60d16a8923201bb27ad7c09016abab2818cf8ce Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 25 Apr 2012 21:24:17 +0200 Subject: Btrfs: avoid buffer overrun in mount option handling There is an off-by-one error: allocating room for a maximal result string but without room for a trailing NUL. That, can lead to returning a transformed string that is not NUL-terminated, and then to a caller reading beyond end of the malloc'd buffer. Rewrite to s/kzalloc/kmalloc/, remove unwarranted use of strncpy (the result is guaranteed to fit), remove dead strlen at end, and change a few variable names and comments. Reviewed-by: Josef Bacik Signed-off-by: Jim Meyering --- fs/btrfs/super.c | 67 ++++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 46b26650415..96eb9fef7bd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -923,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode) */ static char *setup_root_args(char *args) { - unsigned copied = 0; - unsigned len = strlen(args) + 2; - char *pos; - char *ret; + unsigned len = strlen(args) + 2 + 1; + char *src, *dst, *buf; /* - * We need the same args as before, but minus + * We need the same args as before, but with this substitution: + * s!subvol=[^,]+!subvolid=0! * - * subvol=a - * - * and add - * - * subvolid=0 - * - * which is a difference of 2 characters, so we allocate strlen(args) + - * 2 characters. + * Since the replacement string is up to 2 bytes longer than the + * original, allocate strlen(args) + 2 + 1 bytes. */ - ret = kzalloc(len * sizeof(char), GFP_NOFS); - if (!ret) - return NULL; - pos = strstr(args, "subvol="); + src = strstr(args, "subvol="); /* This shouldn't happen, but just in case.. */ - if (!pos) { - kfree(ret); + if (!src) + return NULL; + + buf = dst = kmalloc(len, GFP_NOFS); + if (!buf) return NULL; - } /* - * The subvol=<> arg is not at the front of the string, copy everybody - * up to that into ret. + * If the subvol= arg is not at the start of the string, + * copy whatever precedes it into buf. */ - if (pos != args) { - *pos = '\0'; - strcpy(ret, args); - copied += strlen(args); - pos++; + if (src != args) { + *src++ = '\0'; + strcpy(buf, args); + dst += strlen(args); } - strncpy(ret + copied, "subvolid=0", len - copied); - - /* Length of subvolid=0 */ - copied += 10; + strcpy(dst, "subvolid=0"); + dst += strlen("subvolid=0"); /* - * If there is no , after the subvol= option then we know there's no - * other options and we can just return. + * If there is a "," after the original subvol=... string, + * copy that suffix into our buffer. Otherwise, we're done. */ - pos = strchr(pos, ','); - if (!pos) - return ret; + src = strchr(src, ','); + if (src) + strcpy(dst, src); - /* Copy the rest of the arguments into our buffer */ - strncpy(ret + copied, pos, len - copied); - copied += strlen(pos); - - return ret; + return buf; } static struct dentry *mount_subvol(const char *subvol_name, int flags, -- cgit v1.2.3 From 4e89915220e2f1341c757b610d0f0c3821f3a65f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 13:52:09 -0400 Subject: Btrfs: do not check delalloc when updating disk_i_size We are checking delalloc to see if it is ok to update the i_size. There are 2 cases it stops us from updating 1) If there is delalloc between our current disk_i_size and this ordered extent 2) If there is delalloc between our current ordered extent and the next ordered extent These tests are racy however since we can set delalloc for these ranges at any time. Also for the first case if we notice there is delalloc between disk_i_size and our ordered extent we will not update disk_i_size and assume that when that delalloc bit gets written out it will update everything properly. However if we crash before that we will have file extents outside of our i_size, which is not good, so this test is dangerous as well as racy. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9807750c625..9565c028916 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -751,7 +751,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 disk_i_size; u64 new_i_size; u64 i_size_test; @@ -784,14 +783,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, goto out; } - /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent - */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { - goto out; - } /* * walk backward from this ordered extent to disk_i_size. * if we find an ordered extent then we can't update disk i_size @@ -853,15 +844,11 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. + * extent where there are no ordered extents, we can safely set + * disk_i_size to this. */ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { + if (i_size_test > offset) new_i_size = min_t(u64, i_size_test, i_size); - } BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: -- cgit v1.2.3 From 5fd02043553b02867b29de1ac9fff2ec16b84def Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 May 2012 14:00:54 -0400 Subject: Btrfs: finish ordered extents in their own thread We noticed that the ordered extent completion doesn't really rely on having a page and that it could be done independantly of ending the writeback on a page. This patch makes us not do the threaded endio stuff for normal buffered writes and direct writes so we can end page writeback as soon as possible (in irq context) and only start threads to do the ordered work when it is actually done. Compression needs to be reworked some to take advantage of this as well, but atm it has to do a find_get_page in its endio handler so it must be done in its own thread. This makes direct writes quite a bit faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 12 --- fs/btrfs/extent_io.c | 15 +--- fs/btrfs/extent_io.h | 5 +- fs/btrfs/free-space-cache.c | 4 +- fs/btrfs/inode.c | 177 +++++++++++++++++++------------------------- fs/btrfs/ordered-data.c | 129 ++++++++++++++++++-------------- fs/btrfs/ordered-data.h | 13 +++- 7 files changed, 164 insertions(+), 191 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ffc88a7db..19f5b450f40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3671,17 +3671,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state) -{ - struct super_block *sb = page->mapping->host->i_sb; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - btrfs_error(fs_info, -EIO, - "Error occured while writing out btree at %llu", start); - return -EIO; -} - static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3689,5 +3678,4 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, - .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 836fc37a437..7af93435cee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1172,9 +1172,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, cached_state, mask); } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, cached_state, mask); @@ -2221,17 +2220,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) uptodate = 0; } - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(NULL, page, - start, end, NULL); - /* Writeback already completed */ - if (ret == 0) - return 1; - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b516c3b8dec..4d8124b6457 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -75,9 +75,6 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -225,6 +222,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 202008ec367..cecf8df6248 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -972,9 +972,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - goto out; + btrfs_wait_ordered_range(inode, 0, (u64)-1); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41a62e6954c..9a1b96fd672 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { static int btrfs_setsize(struct inode *inode, loff_t newsize); static int btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -1572,11 +1572,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(root, inode)) metadata = 2; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); - if (ret) - return ret; - if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + return ret; + if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1815,25 +1815,24 @@ out: * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { + struct inode *inode = ordered_extent->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans = NULL; - struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; int compress_type = 0; int ret; bool nolock; - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1); - if (!ret) - return 0; - BUG_ON(!ordered_extent); /* Logic error */ - nolock = btrfs_is_free_space_inode(root, inode); + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); @@ -1889,12 +1888,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->len); } - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } add_pending_csums(trans, inode, ordered_extent->file_offset, @@ -1905,10 +1902,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } } ret = 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); out: if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); @@ -1919,26 +1920,57 @@ out: btrfs_end_transaction(trans, root); } + if (ret) + clear_extent_uptodate(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, NULL, GFP_NOFS); + + /* + * This needs to be dont to make sure anybody waiting knows we are done + * upating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - return 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); - goto out; + return ret; +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_workers *workers; + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - return btrfs_finish_ordered_io(page->mapping->host, start, end); + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) + return 0; + + ordered_extent->work.func = finish_ordered_fn; + ordered_extent->work.flags = 0; + + if (btrfs_is_free_space_inode(root, inode)) + workers = &root->fs_info->endio_freespace_worker; + else + workers = &root->fs_info->endio_write_workers; + btrfs_queue_worker(workers, &ordered_extent->work); + + return 0; } /* @@ -5909,9 +5941,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; - struct extent_state *cached_state = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; int ret; @@ -5921,73 +5951,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes); + ordered_bytes, !err); if (!ret) goto out_test; - BUG_ON(!ordered); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - err = -ENOMEM; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) - err = btrfs_update_inode_fallback(trans, root, inode); - goto out; - } - - lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, 0, - &cached_state); - - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - ret = btrfs_mark_extent_written(trans, inode, - ordered->file_offset, - ordered->file_offset + - ordered->len); - if (ret) { - err = ret; - goto out_unlock; - } - } else { - ret = insert_reserved_file_extent(trans, inode, - ordered->file_offset, - ordered->start, - ordered->disk_len, - ordered->len, - ordered->len, - 0, 0, 0, - BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered->file_offset, ordered->len); - if (ret) { - err = ret; - WARN_ON(1); - goto out_unlock; - } - } - - add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode_fallback(trans, root, inode); - ret = 0; -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, - &cached_state, GFP_NOFS); -out: - btrfs_delalloc_release_metadata(inode, ordered->len); - btrfs_end_transaction(trans, root); - ordered_offset = ordered->file_offset + ordered->len; - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - + ordered->work.func = finish_ordered_fn; + ordered->work.flags = 0; + btrfs_queue_worker(&root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -5996,12 +5967,12 @@ out_test: if (ordered_offset < dip->logical_offset + dip->bytes) { ordered_bytes = dip->logical_offset + dip->bytes - ordered_offset; + ordered = NULL; goto again; } out_done: bio->bi_private = dip->private; - kfree(dip->csums); kfree(dip); /* If we had an error make sure to clear the uptodate flag */ @@ -6069,9 +6040,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int ret; bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto err; + + if (!write) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + } if (skip_sum) goto map; @@ -6491,13 +6465,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) static void btrfs_invalidatepage(struct page *page, unsigned long offset) { + struct inode *inode = page->mapping->host; struct extent_io_tree *tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - /* * we have the page locked, so new writeback can't start, * and the dirty bit won't be cleared while we are here. @@ -6507,13 +6481,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ wait_on_page_writeback(page); - tree = &BTRFS_I(page->mapping->host)->io_tree; + tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(page->mapping->host, + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* @@ -6528,9 +6502,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * whoever cleared the private bit is responsible * for the finish_ordered_io */ - if (TestClearPagePrivate2(page)) { - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + if (TestClearPagePrivate2(page) && + btrfs_dec_test_ordered_pending(inode, &ordered, page_start, + PAGE_CACHE_SIZE, 1)) { + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); cached_state = NULL; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9565c028916..9e138cdc36c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->len = len; entry->disk_len = disk_len; entry->bytes_left = len; - entry->inode = inode; + entry->inode = igrab(inode); entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, trace_btrfs_ordered_extent_add(inode, entry); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) ordered_data_tree_panic(inode, -EEXIST, file_offset); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); } /* @@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode, */ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size) + u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; + unsigned long flags; u64 dec_end; u64 dec_start; u64 to_dec; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { ret = 1; @@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, (unsigned long long)to_dec); } entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -332,7 +336,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -347,15 +351,21 @@ out: */ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; int ret; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: if (!offset_in_entry(entry, file_offset)) { ret = 1; goto out; @@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, (unsigned long long)io_size); } entry->bytes_left -= io_size; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -383,7 +397,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. + * and waiters are woken up. */ -static void __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); wake_up(&entry->wait); } @@ -663,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -674,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -690,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) { node = tree_search(tree, file_offset + len); @@ -715,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, out: if (entry) atomic_inc(&entry->refs); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -731,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -739,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -765,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -803,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } node = prev; } - while (node) { + for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; if (test->file_offset + test->len <= disk_i_size) break; if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; - node = rb_prev(node); } new_i_size = min_t(u64, offset, i_size); @@ -829,17 +834,27 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else node = rb_first(&tree->tree); } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. - */ + + /* + * We are looking for an area between our current extent and the next + * ordered extent to update the i_size to. There are 3 cases here + * + * 1) We don't actually have anything and we can update to i_size. + * 2) We have stuff but they already did their i_size update so again we + * can just update to i_size. + * 3) We have an outstanding ordered extent so the most we can update + * our disk_i_size to is the start of the next offset. + */ + i_size_test = i_size; + for (; node; node = rb_next(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) + + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; + if (test->file_offset > offset) { i_size_test = test->file_offset; - } else { - i_size_test = i_size; + break; + } } /* @@ -853,15 +868,15 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, ret = 0; out: /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. */ if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); return ret; } @@ -886,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, if (!ordered) return 1; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; @@ -901,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, } } out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); return ret; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c355ad4dc1a..e03c560d299 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,6 +74,12 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ +#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ + +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent + * has done its due diligence in updating + * the isize. */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -113,6 +119,8 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; + + struct btrfs_work work; }; @@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); + u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size); + u64 *file_offset, u64 io_size, + int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, -- cgit v1.2.3 From f8c5d0b443ff87c43ba690fa2b5bd2c9387d8624 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 10 May 2012 18:10:38 +0800 Subject: Btrfs: fix wrong error returned by adding a device Reproduce: $ mkfs.btrfs /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs -o ro $ btrfs dev add /dev/sdb8 /mnt/btrfs ERROR: error adding the device '/dev/sdb8' - Invalid argument Since we mount with readonly options, and /dev/sdb7 is not a seeding one, a readonly notification is preferred. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1411b99555a..48a06d1fc06 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1633,7 +1633,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) int ret = 0; if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) - return -EINVAL; + return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); -- cgit v1.2.3 From d1ac6e41d5437385957fd708e285defd0b1a430c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 10 May 2012 18:10:39 +0800 Subject: Btrfs: use fastpath in extent state ops as much as possible Fully utilize our extent state's new helper functions to use fastpath as much as possible. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/extent_io.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7af93435cee..69a527c7a0b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -569,10 +569,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - clear_state_bit(tree, state, &bits, wake); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; + state = clear_state_bit(tree, state, &bits, wake); + goto next; } goto search_again; } @@ -780,7 +778,6 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; @@ -788,20 +785,15 @@ hit_next: } set_state_bits(tree, state, &bits); - cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -844,6 +836,10 @@ hit_next: if (last_end == (u64)-1) goto out; start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -993,21 +989,14 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -1041,10 +1030,13 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } -- cgit v1.2.3 From 9ba1f6e44ed7a1fa52d3f292508bf921b5054172 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 11 May 2012 18:11:26 +0800 Subject: Btrfs: do not do balance in readonly mode In normal cases, we would not be allowed to do balance in RO mode. However, when we're using a seeding device and adding another device to sprout, things will change: $ mkfs.btrfs /dev/sdb7 $ btrfstune -S 1 /dev/sdb7 $ mount /dev/sdb7 /mnt/btrfs -o ro $ btrfs fi bal /mnt/btrfs -----------------------> fail. $ btrfs dev add /dev/sdb8 /mnt/btrfs $ btrfs fi bal /mnt/btrfs -----------------------> works! It should not be designed as an exception, and we'd better add another check for mnt flags. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik --- fs/btrfs/ioctl.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3d8ab27622c..15baf945630 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3216,8 +3216,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, } } -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_balance(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; @@ -3229,6 +3230,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3295,6 +3300,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + mnt_drop_write(file->f_path.mnt); return ret; } @@ -3390,7 +3396,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); + return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3423,7 +3429,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); + return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: -- cgit v1.2.3 From cd023e7b17fe86c530475da210b3348421c40e5f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 May 2012 10:06:40 -0400 Subject: Btrfs: merge contigous regions when loading free space cache When we write out the free space cache we will write out everything that is in our in memory tree, and then we will just walk the pinned extents tree and write anything we see there. The problem with this is that during normal operations the pinned extents will be merged back into the free space tree normally, and then we can allocate space from the merged areas and commit them to the tree log. If we crash and replay the tree log we will crash again because the tree log will try to free up space from what looks like 2 seperate but contiguous entries, since one entry is from the original free space cache and the other was a pinned extent that was merged back. To fix this we just need to walk the free space tree after we load it and merge contiguous entries back together. This will keep the tree log stuff from breaking and it will make the allocator behave more nicely. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cecf8df6248..19a0d85b451 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -33,6 +33,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -584,6 +586,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, return 0; } +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries. This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache. So run through the space cache that we just + * loaded and merge contiguous entries. This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. + */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *e, *prev = NULL; + struct rb_node *n; + +again: + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + e = rb_entry(n, struct btrfs_free_space, offset_index); + if (!prev) + goto next; + if (e->bitmap || prev->bitmap) + goto next; + if (prev->offset + prev->bytes == e->offset) { + unlink_free_space(ctl, prev); + unlink_free_space(ctl, e); + prev->bytes += e->bytes; + kmem_cache_free(btrfs_free_space_cachep, e); + link_free_space(ctl, prev); + prev = NULL; + spin_unlock(&ctl->tree_lock); + goto again; + } +next: + prev = e; + } + spin_unlock(&ctl->tree_lock); +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) @@ -726,6 +766,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } io_ctl_drop_pages(&io_ctl); + merge_space_tree(ctl); ret = 1; out: io_ctl_free(&io_ctl); -- cgit v1.2.3 From 72ac3c0d7921f943d92d1ef42a549fb52e56817d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 14:13:11 -0400 Subject: Btrfs: convert the inode bit field to use the actual bit operations Miao pointed this out while I was working on an orphan problem that messing with a bitfield where different ranges are protected by different locks doesn't work out right. Turns out we've been doing this forever where we have different parts of the bit field protected by either no lock at all or different locks which could cause all sorts of weird problems including the issue I was hitting. So instead make a runtime_flags thing that we use the normal bit operations on that are all atomic so we can keep having our no/different locking for the different flags and then make force_compress it's own thing so it can be treated normally. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 30 ++++++++++++++++-------------- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent-tree.c | 11 ++++++----- fs/btrfs/file.c | 12 ++++++------ fs/btrfs/inode.c | 28 ++++++++++++---------------- 6 files changed, 44 insertions(+), 44 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3771b8543a7..6265edb219e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -24,6 +24,19 @@ #include "ordered-data.h" #include "delayed-inode.h" +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 + /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ @@ -78,6 +91,8 @@ struct btrfs_inode { /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; + unsigned long runtime_flags; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -141,23 +156,10 @@ struct btrfs_inode { unsigned outstanding_extents; unsigned reserved_extents; - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - */ - unsigned ordered_data_close:1; - unsigned orphan_meta_reserved:1; - unsigned dummy_inode:1; - unsigned in_defrag:1; - unsigned delalloc_meta_reserved:1; - /* * always compress this one file */ - unsigned force_compress:4; + unsigned force_compress; struct btrfs_delayed_node *delayed_node; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bcd40c7109f..c18d0442ae6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); - if (BTRFS_I(inode)->delalloc_meta_reserved) { - BTRFS_I(inode)->delalloc_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { spin_unlock(&BTRFS_I(inode)->lock); release = true; goto migrate; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19f5b450f40..0cf8ef2b5b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2001,7 +2001,8 @@ int open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 59ae191d4f9..1902726fa70 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode) BTRFS_I(inode)->outstanding_extents--; if (BTRFS_I(inode)->outstanding_extents == 0 && - BTRFS_I(inode)->delalloc_meta_reserved) { + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) drop_inode_space = 1; - BTRFS_I(inode)->delalloc_meta_reserved = 0; - } /* * If we have more or the same amount of outsanding extents than we have @@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * Add an item to reserve for updating the inode when we complete the * delalloc io. */ - if (!BTRFS_I(inode)->delalloc_meta_reserved) { + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { nr_extents++; extra_reserve = 1; } @@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); if (extra_reserve) { - BTRFS_I(inode)->delalloc_meta_reserved = 1; + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); nr_extents--; } BTRFS_I(inode)->reserved_extents += nr_extents; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index cfc0ab915d0..c9005f21697 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -103,7 +103,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, goto exists; } } - BTRFS_I(inode)->in_defrag = 1; + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); return; @@ -131,7 +131,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (btrfs_fs_closing(root->fs_info)) return 0; - if (BTRFS_I(inode)->in_defrag) + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) return 0; if (trans) @@ -148,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!BTRFS_I(inode)->in_defrag) + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else kfree(defrag); @@ -252,7 +252,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) goto next; /* do a chunk of defrag */ - BTRFS_I(inode)->in_defrag = 0; + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); range.start = defrag->last_offset; num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, defrag_batch); @@ -1465,8 +1465,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) { btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9a1b96fd672..91ad6390175 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2182,10 +2182,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; } - if (!BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 1; + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) reserve = 1; - } spin_unlock(&root->orphan_lock); /* grab metadata reservation from transaction handle */ @@ -2233,10 +2232,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) delete_item = 1; } - if (BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) release_rsv = 1; - } spin_unlock(&root->orphan_lock); if (trans && delete_item) { @@ -3642,7 +3640,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) * any new writes get down to disk quickly. */ if (newsize == 0) - BTRFS_I(inode)->ordered_data_close = 1; + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags); /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); @@ -4102,7 +4101,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - BTRFS_I(inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; inode->i_op = &btrfs_dir_ro_inode_operations; @@ -4406,7 +4405,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; bool nolock = false; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) @@ -4439,7 +4438,7 @@ int btrfs_dirty_inode(struct inode *inode) struct btrfs_trans_handle *trans; int ret; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; trans = btrfs_join_transaction(root); @@ -6752,7 +6751,8 @@ static int btrfs_truncate(struct inode *inode) * using truncate to replace the contents of the file will * end up with a zero length file after a crash. */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); while (1) { @@ -6889,11 +6889,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->outstanding_extents = 0; ei->reserved_extents = 0; - ei->ordered_data_close = 0; - ei->orphan_meta_reserved = 0; - ei->dummy_inode = 0; - ei->in_defrag = 0; - ei->delalloc_meta_reserved = 0; + ei->runtime_flags = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; -- cgit v1.2.3 From 8a35d95ff4680a456d3ce47df9638f33d4f54f20 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 14:26:42 -0400 Subject: Btrfs: fix how we deal with the orphan block rsv Ceph was hitting this race where we would remove an inode from the per-root orphan list before we would release the space we had reserved for the inode. We actually don't need a list or anything, we just need to make sure the root doesn't try to free up the orphan reserve until after the inodes have released their reservations. So use an atomic counter instead of a list on the root and only decrement the counter after we've released our reservation. I've tested this as well as several others and we no longer see the warnings that you would see while running ceph. Thanks, Btrfs: fix how we deal with the orphan block rsv Ceph was hitting this race where we would remove an inode from the per-root orphan list before we would release the space we had reserved for the inode. We actually don't need a list or anything, we just need to make sure the root doesn't try to free up the orphan reserve until after the inodes have released their reservations. So use an atomic counter instead of a list on the root and only decrement the counter after we've released our reservation. I've tested this as well as several others and we no longer see the warnings that you would see while running ceph. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 38 +++++++++++++++++++++----------------- 4 files changed, 24 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6265edb219e..ce2c9d60031 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,6 +36,7 @@ #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 /* in memory btrfs inode */ struct btrfs_inode { @@ -70,9 +71,6 @@ struct btrfs_inode { /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd72331d60..aad2600718a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1375,7 +1375,7 @@ struct btrfs_root { struct list_head root_list; spinlock_t orphan_lock; - struct list_head orphan_list; + atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; int orphan_item_inserted; int orphan_cleanup_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cf8ef2b5b1..297e5a8ed93 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); - INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); @@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->orphan_inodes, 0); root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91ad6390175..029892887fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; int ret; - if (!list_empty(&root->orphan_list) || + if (atomic_read(&root->orphan_inodes) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { + if (atomic_read(&root->orphan_inodes)) { spin_unlock(&root->orphan_lock); return; } @@ -2166,8 +2166,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) block_rsv = NULL; } - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { #if 0 /* * For proper ENOSPC handling, we should do orphan @@ -2180,6 +2180,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; + atomic_dec(&root->orphan_inodes); } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, @@ -2197,6 +2198,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -2227,10 +2230,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) int ret = 0; spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - list_del_init(&BTRFS_I(inode)->i_orphan); + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) delete_item = 1; - } if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) @@ -2242,8 +2244,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } - if (release_rsv) + if (release_rsv) { btrfs_orphan_release_metadata(inode); + atomic_dec(&root->orphan_inodes); + } return 0; } @@ -2371,6 +2375,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } + printk(KERN_ERR "auto deleting %Lu\n", + found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -2382,9 +2388,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -3706,7 +3711,8 @@ void btrfs_evict_inode(struct inode *inode) btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)); goto no_delete; } @@ -6903,7 +6909,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); - INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); @@ -6948,13 +6953,12 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", (unsigned long long)btrfs_ino(inode)); - list_del_init(&BTRFS_I(inode)->i_orphan); + atomic_dec(&root->orphan_inodes); } - spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); -- cgit v1.2.3 From 2adcac1a7331d93a17285804819caa96070b231f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 23 May 2012 16:10:14 -0400 Subject: Btrfs: fall back to non-inline if we don't have enough space If cow_file_range_inline fails with ENOSPC we abort the transaction which isn't very nice. This really shouldn't be happening anyways but there's no sense in making it a horrible error when we can easily just go allocate normal data space for this stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 029892887fc..92df0a5d1d9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - if (ret) { + if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); return ret; + } else if (ret == -ENOSPC) { + return 1; } + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; -- cgit v1.2.3 From 762f2263260d576504aeb23d20f90120acdb025f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 May 2012 18:58:27 +0800 Subject: Btrfs: fix the same inode id problem when doing auto defragment Two files in the different subvolumes may have the same inode id, so The rb-tree which is used to manage the defragment object must take it into account. This patch fix this problem. Signed-off-by: Miao Xie --- fs/btrfs/file.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c9005f21697..2e63cdc2b09 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -65,6 +65,21 @@ struct inode_defrag { int cycled; }; +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + /* pop a record for an inode into the defrag tree. The lock * must be held already * @@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; + int ret; p = &root->fs_info->defrag_inodes.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (defrag->ino < entry->ino) + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) p = &parent->rb_left; - else if (defrag->ino > entry->ino) + else if (ret > 0) p = &parent->rb_right; else { /* if we're reinserting an entry for @@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, /* * must be called with the defrag_inodes lock held */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, + u64 root, u64 ino, struct rb_node **next) { struct inode_defrag *entry = NULL; + struct inode_defrag tmp; struct rb_node *p; struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; p = info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (ino < entry->ino) + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) p = parent->rb_left; - else if (ino > entry->ino) + else if (ret > 0) p = parent->rb_right; else return entry; } if (next) { - while (parent && ino > entry->ino) { + while (parent && __compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); entry = rb_entry(parent, struct inode_defrag, rb_node); } @@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; u64 first_ino = 0; + u64 root_objectid = 0; int num_defrag; int defrag_batch = 1024; @@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) n = NULL; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); + defrag = btrfs_find_defrag_inode(fs_info, root_objectid, + first_ino, &n); if (!defrag) { - if (n) - defrag = rb_entry(n, struct inode_defrag, rb_node); - else if (first_ino) { + if (n) { + defrag = rb_entry(n, struct inode_defrag, + rb_node); + } else if (root_objectid || first_ino) { + root_objectid = 0; first_ino = 0; continue; } else { @@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* remove it from the rbtree */ first_ino = defrag->ino + 1; + root_objectid = defrag->root; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); if (btrfs_fs_closing(fs_info)) -- cgit v1.2.3 From d07eb9117050c9ed3f78296ebcc06128b52693be Mon Sep 17 00:00:00 2001 From: Asias He Date: Fri, 25 May 2012 11:10:21 +0800 Subject: btrfs: Drop unused function btrfs_abort_devices() 1) This function is not used anywhere. 2) Using the blk_abort_queue() to abort the queue seems not correct. blk_abort_queue() is used for timeout handling (block/blk-timeout.c). Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Cc: Jens Axboe Cc: linux-kernel@vger.kernel.org Signed-off-by: Asias He --- fs/btrfs/disk-io.c | 13 ------------- fs/btrfs/disk-io.h | 1 - 2 files changed, 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 297e5a8ed93..0f788c05906 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2903,19 +2903,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -/* Kill all outstanding I/O */ -void btrfs_abort_devices(struct btrfs_root *root) -{ - struct list_head *head; - struct btrfs_device *dev; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - head = &root->fs_info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - blk_abort_queue(dev->bdev->bd_disk->queue); - } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -} - void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ab1830aaf0e..05b3fab39f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); -- cgit v1.2.3 From 442a4f6308e694e0fa6025708bd5e4e424bbf51c Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:08 +0200 Subject: Btrfs: add device counters for detected IO and checksum errors The goal is to detect when drives start to get an increased error rate, when drives should be replaced soon. Therefore statistic counters are added that count IO errors (read, write and flush). Additionally, the software detected errors like checksum errors and corrupted blocks are counted. Signed-off-by: Stefan Behrens --- fs/btrfs/disk-io.c | 13 +++++--- fs/btrfs/extent_io.c | 18 ++++++++-- fs/btrfs/ioctl.h | 19 +++++++++++ fs/btrfs/scrub.c | 65 +++++++++++++++++++++++++++--------- fs/btrfs/volumes.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 45 +++++++++++++++++++++++++ 6 files changed, 230 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0f788c05906..46d474e74aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2557,18 +2557,19 @@ recovery_tree_root: static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { + struct btrfs_device *device = (struct btrfs_device *) + bh->b_private; + printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); + "I/O error on %s\n", device->name); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); } unlock_buffer(bh); put_bh(bh); @@ -2683,6 +2684,7 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } /* @@ -2741,6 +2743,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait) } if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; + if (!bio_flagged(bio, BIO_EOPNOTSUPP)) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 69a527c7a0b..b3692c1373a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1913,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { /* try to remap that extent elsewhere? */ bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } @@ -2327,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); - if (ret) + if (ret) { + /* no IO indicated but software detected errors + * in the block, either checksum errors or + * issues with the contents */ + struct btrfs_root *root = + BTRFS_I(page->mapping->host)->root; + struct btrfs_device *device; + uptodate = 0; - else + device = btrfs_find_device_for_logical( + root, start, mirror); + if (device) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } else { clean_io_failure(start, page); + } } if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 086e6bdae1c..5bf05e28b82 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -266,6 +266,25 @@ struct btrfs_ioctl_logical_ino_args { __u64 inodes; }; +enum btrfs_dev_stat_values { + /* disk I/O failure stats */ + BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not + * been written */ + + BTRFS_DEV_STAT_VALUES_MAX +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f3d6f917fb..a38cfa4f251 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -50,7 +50,7 @@ struct scrub_dev; struct scrub_page { struct scrub_block *sblock; struct page *page; - struct block_device *bdev; + struct btrfs_device *dev; u64 flags; /* extent flags */ u64 generation; u64 logical; @@ -86,6 +86,7 @@ struct scrub_block { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; + unsigned int generation_error:1; /* also sets header_error */ }; }; @@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sdev->stat_lock); sdev->stat.csum_errors++; spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sdev->stat_lock); sdev->stat.verify_errors++; @@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); + if (sblock_bad->generation_error) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } if (sdev->readonly) @@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page = sblock->pagev + page_index; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, bdev is NULL */ - page->bdev = bbio->stripes[mirror_index].dev->bdev; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { @@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_page *page = sblock->pagev + page_num; DECLARE_COMPLETION_ONSTACK(complete); - if (page->bdev == NULL) { + if (page->dev->bdev == NULL) { page->io_error = 1; sblock->no_io_error_seen = 0; continue; @@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page->bdev; + bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, h = (struct btrfs_header *)mapped_buffer; if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || - generation != le64_to_cpu(h->generation) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + BTRFS_UUID_SIZE)) { sblock->header_error = 1; + } else if (generation != le64_to_cpu(h->generation)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } csum = h->csum; } else { if (!have_csum) @@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page_bad->bdev; + bio->bi_bdev = page_bad->dev->bdev; bio->bi_sector = page_bad->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, /* this will also unplug the queue */ wait_for_completion(&complete); + if (!bio_flagged(bio, BIO_UPTODATE)) { + btrfs_dev_stat_inc_and_print(page_bad->dev, + BTRFS_DEV_STAT_WRITE_ERRS); + bio_put(bio); + return -EIO; + } bio_put(bio); } @@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) u64 mapped_size; void *p; u32 crc = ~(u32)0; - int fail = 0; + int fail_gen = 0; + int fail_cor = 0; u64 len; int index; @@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock) memcpy(on_disk_csum, s->csum, sdev->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) - ++fail; + ++fail_cor; if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) - ++fail; + ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) - ++fail; + ++fail_cor; len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; @@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) btrfs_csum_final(crc, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) - ++fail; + ++fail_cor; - if (fail) { + if (fail_cor + fail_gen) { /* * if we find an error in a super block, we just report it. * They will get written with the next transaction commit @@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) spin_lock(&sdev->stat_lock); ++sdev->stat.super_errors; spin_unlock(&sdev->stat_lock); + if (fail_cor) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); } - return fail; + return fail_cor + fail_gen; } static void scrub_block_get(struct scrub_block *sblock) @@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->bdev = sdev->dev->bdev; + spage->dev = sdev->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 48a06d1fc06..2915521f44e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include "compat.h" @@ -4001,13 +4002,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static void *merge_stripe_index_into_bio_private(void *bi_private, + unsigned int stripe_index) +{ + /* + * with single, dup, RAID0, RAID1 and RAID10, stripe_index is + * at most 1. + * The alternative solution (instead of stealing bits from the + * pointer) would be to allocate an intermediate structure + * that contains the old private pointer plus the stripe_index. + */ + BUG_ON((((uintptr_t)bi_private) & 3) != 0); + BUG_ON(stripe_index > 3); + return (void *)(((uintptr_t)bi_private) | stripe_index); +} + +static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) +{ + return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); +} + +static unsigned int extract_stripe_index_from_bio_private(void *bi_private) +{ + return (unsigned int)((uintptr_t)bi_private) & 3; +} + static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); int is_orig_bio = 0; - if (err) + if (err) { atomic_inc(&bbio->error); + if (err == -EIO || err == -EREMOTEIO) { + unsigned int stripe_index = + extract_stripe_index_from_bio_private( + bio->bi_private); + struct btrfs_device *dev; + + BUG_ON(stripe_index >= bbio->num_stripes); + dev = bbio->stripes[stripe_index].dev; + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } + } if (bio == bbio->orig_bio) is_orig_bio = 1; @@ -4149,6 +4195,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; } bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); bio->bi_end_io = btrfs_end_bio; bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; @@ -4509,6 +4557,28 @@ int btrfs_read_sys_array(struct btrfs_root *root) return ret; } +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int ret; + u64 map_length = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *device; + + BUG_ON(mirror_num == 0); + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, + mirror_num); + if (ret) { + BUG_ON(bbio != NULL); + return NULL; + } + BUG_ON(mirror_num != bbio->mirror_num); + device = bbio->stripes[mirror_num - 1].dev; + kfree(bbio); + return device; +} + int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; @@ -4583,3 +4653,23 @@ error: btrfs_free_path(path); return ret; } + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + btrfs_dev_stat_print_on_error(dev); +} + +void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +{ + printk_ratelimited(KERN_ERR + "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_GENERATION_ERRS)); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bb6b03f97aa..193b2835e6a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -22,6 +22,7 @@ #include #include #include "async-thread.h" +#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -106,6 +107,10 @@ struct btrfs_device { struct completion flush_wait; int nobarriers; + /* disk I/O failure stats. For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_dirty; /* counters need to be written to disk */ + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; struct btrfs_fs_devices { @@ -281,4 +286,44 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num); +void btrfs_dev_stat_print_on_error(struct btrfs_device *device); +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); + +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + dev->dev_stats_dirty = 1; +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + dev->dev_stats_dirty = 1; + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + dev->dev_stats_dirty = 1; +} + +static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, + int index) +{ + btrfs_dev_stat_set(dev, index, 0); +} #endif -- cgit v1.2.3 From c11d2c236cc260b36ef644700fbe99bcc7e7da33 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:09 +0200 Subject: Btrfs: add ioctl to get and reset the device stats An ioctl interface is added to get the device statistic counters. A second ioctl is added to atomically get and reset these counters. Signed-off-by: Stefan Behrens --- fs/btrfs/ioctl.c | 26 ++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 14 ++++++++++++++ fs/btrfs/volumes.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 3 +++ 4 files changed, 77 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 15baf945630..0f8c354c4c7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3046,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, + void __user *arg, int reset_after_read) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + if (reset_after_read && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_get_dev_stats(root, sa, reset_after_read); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3434,6 +3456,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 0); + case BTRFS_IOC_GET_AND_RESET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 1); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 5bf05e28b82..497c530724c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -285,6 +285,16 @@ enum btrfs_dev_stat_values { BTRFS_DEV_STAT_VALUES_MAX }; +struct btrfs_ioctl_get_dev_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + + /* out values: */ + __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; + + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -349,5 +359,9 @@ enum btrfs_dev_stat_values { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_get_dev_stats) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2915521f44e..a112b758822 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4673,3 +4673,37 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); } + +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, device not found\n"); + return -ENODEV; + } else if (reset_after_read) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_reset(dev, i); + } + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 193b2835e6a..6798f8674b1 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -290,6 +290,9 @@ struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); void btrfs_dev_stat_print_on_error(struct btrfs_device *device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) -- cgit v1.2.3 From 733f4fbbc1083aa343da739f46ee839705d6cfe3 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 25 May 2012 16:06:10 +0200 Subject: Btrfs: read device stats on mount, write modified ones during commit The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistics for each involved device are read from the device tree and used to initialize the counters. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 38 +++++++++++ fs/btrfs/disk-io.c | 7 ++ fs/btrfs/print-tree.c | 3 + fs/btrfs/transaction.c | 4 ++ fs/btrfs/volumes.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 ++ 6 files changed, 232 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aad2600718a..e176f8c551f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,6 +823,14 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1507,6 +1515,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_BALANCE_ITEM_KEY 248 +/* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY 249 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -2415,6 +2429,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index) +{ + u64 val; + + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 46d474e74aa..b0d49e21b0b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2354,6 +2354,13 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + ret); + goto fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452486b..5e23684887e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + case BTRFS_DEV_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; }; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 36422254ef6..82b03afcbd9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "locking.h" #include "tree-log.h" #include "inode-map.h" +#include "volumes.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -758,6 +759,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_run_dev_stats(trans, root->fs_info); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a112b758822..7782020996f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -362,6 +364,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->dev_stats_valid = 0; device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); @@ -4654,6 +4657,162 @@ error: return ret; } +static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_reset(dev, i); +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int i; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_dev_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + device->name, (unsigned long long)device->devid); + __btrfs_reset_dev_stats(device); + device->dev_stats_valid = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_reset(device, i); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, device->name); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. + */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->dev_stats_valid || !device->dev_stats_dirty) + continue; + + ret = update_dev_stat_item(trans, dev_root, device); + if (!ret) + device->dev_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); @@ -4662,6 +4821,8 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { + if (!dev->dev_stats_valid) + return; printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", dev->name, @@ -4674,6 +4835,17 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) BTRFS_DEV_STAT_GENERATION_ERRS)); } +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats, int reset_after_read) @@ -4690,6 +4862,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root, printk(KERN_WARNING "btrfs: get dev_stats failed, device not found\n"); return -ENODEV; + } else if (!dev->dev_stats_valid) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, not yet valid\n"); + return -ENODEV; } else if (reset_after_read) { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (stats->nr_items > i) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6798f8674b1..3406a88ca83 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -109,6 +109,7 @@ struct btrfs_device { /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; int dev_stats_dirty; /* counters need to be written to disk */ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; @@ -293,6 +294,9 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats, int reset_after_read); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) -- cgit v1.2.3 From 018642a1f197887058e97291460b890d296e8953 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 29 May 2012 18:10:13 +0900 Subject: Btrfs: return value of btrfs_read_buffer is checked correctly btrfs_read_buffer() has the possibility of returning the error. Therefore, I add the code in which the return value of btrfs_read_buffer() is checked. Signed-off-by: Tsutomu Itoh --- fs/btrfs/ctree.c | 6 +++++- fs/btrfs/tree-log.c | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 26847999c64..99fcad631a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -739,7 +739,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) return -EIO; } else if (!uptodate) { - btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } } } if (search_start == 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eb1ae908582..6f22a4fca8d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; level = btrfs_header_level(eb); @@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); @@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } WARN_ON(*level <= 0); if (path->nodes[*level-1]) -- cgit v1.2.3 From 22ee6985de7d3e81ec0cef9c6ba01b45ad1bafeb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 May 2012 16:57:49 -0400 Subject: Btrfs: check to see if the inode is in the log before fsyncing We have this check down in the actual logging code, but this is after we start a transaction and all that good stuff. So move the helper inode_in_log() out so we can call it in fsync() and avoid starting a transaction altogether and just exit if we've already fsync()'ed this file recently. You would notice this issue if you fsync()'ed a file over and over again until the transaction committed. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 13 +++++++++++++ fs/btrfs/file.c | 3 ++- fs/btrfs/tree-log.c | 17 +---------------- 3 files changed, 16 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ce2c9d60031..e616f8872e6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -199,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, return false; } +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2e63cdc2b09..876cddd6b2f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1552,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * syncing */ smp_mb(); - if (BTRFS_I(inode)->last_trans <= + if (btrfs_inode_in_log(inode, root->fs_info->generation) || + BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f22a4fca8d..425014bdc6a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3038,21 +3038,6 @@ out: return ret; } -static int inode_in_log(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); - if (BTRFS_I(inode)->logged_trans == trans->transid && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; -} - - /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -3093,7 +3078,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (inode_in_log(trans, inode)) { + if (btrfs_inode_in_log(inode, trans->transid)) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } -- cgit v1.2.3 From 5bdbeb2187a99d690b374a8c5ec9911fcbcfe739 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 May 2012 16:59:49 -0400 Subject: Btrfs: fix return code in drop_objectid_items So dpkg fsync()'s the file and the directory containing the file whenever it writes to a file which is really slow in btrfs. This is partly because fsync()'ing a directory _always_ committed the transaction instead of just going to the tree log. This is because drop_objectid_items() would return 1 since it does a btrfs_search_slot() which returns 1. In tree-log jargon this means that we have to commit the transaction to be safe. So just check if ret is greater than 0 and set it to 0 if it does. With this patch we now use the tree-log instead of committing the entire transaction, which is twice as fast on my box. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 425014bdc6a..2017d0ff511 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2667,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + if (ret > 0) + ret = 0; return ret; } -- cgit v1.2.3 From 3d136a1131c66f7d26fb171e4c5b0b8baacd3129 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 3 Feb 2012 11:20:04 +0100 Subject: Btrfs: set ioprio of scrub readahead to idle Reduce ioprio class of scrub readahead threads to idle priority. This setting is fixed. This priority has shown the best performance during all measurements. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/reada.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e176f8c551f..1c665ebe47e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ac5d0108588..48a4882d8ad 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; struct btrfs_fs_info *fs_info; + int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); fs_info = rmw->fs_info; kfree(rmw); + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), + task_nice_ioprio(current)); + set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); + set_task_ioprio(current, old_ioprio); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 86ff7ffce0b93aed14df4c8dcedd05bb5e2fdfbc Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 24 Apr 2012 18:10:16 +0200 Subject: Btrfs: fix runtime warning in check-integrity check data mode If a file_extent_item was located at the very end of a leaf and there was not enough space to hold a full item, but there was enough space to hold one of type BTRFS_FILE_EXTENT_INLINE or PREALLOC, and it was only such a short item, a warning was printed anyway. This check is now fixed. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7f6cc359e44..ed761838932 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1428,6 +1428,28 @@ static int btrfsic_handle_extent_data( file_extent_item_offset = offsetof(struct btrfs_leaf, items) + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + (unsigned long long) + le64_to_cpu(file_extent_item.disk_bytenr)); + return 0; + } + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > block_ctx->len) { printk(KERN_INFO @@ -1452,9 +1474,6 @@ static int btrfsic_handle_extent_data( le64_to_cpu(file_extent_item.disk_bytenr), (unsigned long long)le64_to_cpu(file_extent_item.offset), (unsigned long long)num_bytes); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) - return 0; while (num_bytes > 0) { u32 chunk_len; int num_copies; -- cgit v1.2.3 From 48235a68a3d1db579fc20d9915815228a1825757 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 23 May 2012 17:57:49 +0200 Subject: Btrfs: fix false positive in check-integrity on unmount During unmount, it could happen that the integrity checker printed a warning message "attempt to free ... on umount which is not yet iodone" which turned out to be a false positive. Signed-off-by: Stefan Behrens --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ed761838932..9cebb1fd6a3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -3337,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root, btrfsic_block_link_free(l); } - if (b_all->is_iodone) + if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else printk(KERN_INFO "btrfs: attempt to free %c-block" -- cgit v1.2.3 From 95a06077f7edbd00d32612562be4d857a5b7df54 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 29 May 2012 17:06:54 +0200 Subject: Btrfs: use delayed ref sequence numbers for all fs-tree updates The sequence number for delayed refs is needed to postpone certain delayed refs for a very short period while walking backrefs. Before the tree modification log, we thought we'd only have to hold back those references that don't have a counter operation. While now we've the tree mod log, we're rewinding fs tree blocks to a defined consistent state. We cannot know in advance for which tree block we'll be doing rewind operations later. Therefore, we must postpone all the delayed refs for fs-tree blocks, even those having a counter operation. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/delayed-ref.c | 10 ++++++---- fs/btrfs/delayed-ref.h | 19 ------------------- 3 files changed, 13 insertions(+), 23 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ba21b12cec..f5f11a6c5e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3121,4 +3121,11 @@ void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 69f22e3ab3b..13ae7b04790 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } @@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index fd824467021..413927fb995 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -224,25 +224,6 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, u64 seq); -/* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. -- cgit v1.2.3 From 3301958b7c1dae8f0f5ded63aa881e0b71e78464 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 30 May 2012 18:05:21 +0200 Subject: Btrfs: add inodes before dropping the extent lock in find_all_leafs We must build up the inode list with the extent lock held after following indirect refs. This also requires an extension to ulists, which allows to modify the stored aux value in case a key already exists in the list. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 36 +++++++++++++++++++++++++++++++----- fs/btrfs/ulist.c | 11 ++++++++++- fs/btrfs/ulist.h | 2 ++ 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0ac47f2834d..3f75895c919 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -106,6 +106,7 @@ struct __prelim_ref { struct btrfs_key key_for_search; int level; int count; + struct extent_inode_elem *inode_list; u64 parent; u64 wanted_disk_byte; }; @@ -166,6 +167,7 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + ref->inode_list = NULL; ref->level = level; ref->count = count; ref->parent = parent; @@ -181,14 +183,21 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, const u64 *extent_item_pos) { int ret; - int slot; + int slot = path->slots[level]; struct extent_buffer *eb = path->nodes[level]; struct btrfs_file_extent_item *fi; + struct extent_inode_elem *eie = NULL; u64 disk_byte; u64 wanted_objectid = key->objectid; add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (level == 0 && extent_item_pos) { + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (ret < 0) + return ret; + } + ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); if (ret < 0) return ret; @@ -202,6 +211,7 @@ add_parent: * repeat this until we don't find any additional EXTENT_DATA items. */ while (1) { + eie = NULL; ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; @@ -346,6 +356,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; + ref->inode_list = + node ? (struct extent_inode_elem *)node->aux : 0; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { @@ -356,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; + new_ref->inode_list = + (struct extent_inode_elem *)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -879,7 +893,7 @@ again: } if (ref->count && ref->parent) { struct extent_inode_elem *eie = NULL; - if (extent_item_pos) { + if (extent_item_pos && !ref->inode_list) { u32 bsz; struct extent_buffer *eb; bsz = btrfs_level_size(fs_info->extent_root, @@ -889,10 +903,22 @@ again: BUG_ON(!eb); ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); + ref->inode_list = eie; free_extent_buffer(eb); } - ret = ulist_add(refs, ref->parent, - (unsigned long)eie, GFP_NOFS); + ret = ulist_add_merge(refs, ref->parent, + (unsigned long)ref->inode_list, + (unsigned long *)&eie, GFP_NOFS); + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } BUG_ON(ret < 0); } kfree(ref); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 17e68bdc307..2ef59400ad6 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -145,12 +145,21 @@ EXPORT_SYMBOL(ulist_free); */ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, unsigned long gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, unsigned long gfp_mask) { int i; for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) + if (ulist->nodes[i].val == val) { + if (old_aux) + *old_aux = ulist->nodes[i].aux; return 0; + } } if (ulist->nnodes >= ulist->nodes_alloced) { diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 62d2574f775..f1b1bf00c5a 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -67,6 +67,8 @@ struct ulist *ulist_alloc(unsigned long gfp_mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, unsigned long gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, unsigned long gfp_mask); struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -- cgit v1.2.3 From 926dd8a640da1bbf7478eebea1c23a842fc9c890 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 31 May 2012 14:00:15 +0200 Subject: Btrfs: add missing spin_lock for insertion into tree mod log tree_mod_alloc calls __get_tree_mod_seq and must acquire a spinlock before doing so. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0954f1770fd..26e8dc1681b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -455,11 +455,11 @@ unlock: return ret; } -int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, - struct tree_mod_elem **tm_ret) +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, + struct tree_mod_elem **tm_ret) { struct tree_mod_elem *tm; - u64 seq = 0; + int seq; smp_mb(); if (list_empty(&fs_info->tree_mod_seq_list)) @@ -469,9 +469,22 @@ int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, if (!tm) return -ENOMEM; - __get_tree_mod_seq(fs_info, &tm->elem); - seq = tm->elem.seq; tm->elem.flags = 0; + spin_lock(&fs_info->tree_mod_seq_lock); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list, because no blocker exists. items + * are removed from the list only when the existing blocker is + * removed from the list. + */ + kfree(tm); + seq = 0; + } else { + __get_tree_mod_seq(fs_info, &tm->elem); + seq = tm->elem.seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); return seq; } -- cgit v1.2.3 From e9b7fd4d8b7c915cff353ca085b83bd19737396b Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 31 May 2012 14:59:09 +0200 Subject: Btrfs: add tree_mod_dont_log helper Replace duplicate code by small inline helper function. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 26e8dc1681b..c7c48489b96 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -455,14 +455,25 @@ unlock: return ret; } +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (!eb) + return 0; + if (btrfs_header_level(eb) == 0) + return 1; + return 0; +} + static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { struct tree_mod_elem *tm; int seq; - smp_mb(); - if (list_empty(&fs_info->tree_mod_seq_list)) + if (tree_mod_dont_log(fs_info, NULL)) return 0; tm = *tm_ret = kzalloc(sizeof(*tm), flags); @@ -643,8 +654,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, int ret; int i; - smp_mb(); - if (list_empty(&fs_info->tree_mod_seq_list)) + if (tree_mod_dont_log(fs_info, NULL)) return; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) @@ -691,11 +701,7 @@ static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, int ret; u32 nritems; - smp_mb(); - if (list_empty(&fs_info->tree_mod_seq_list)) - return; - - if (btrfs_header_level(eb) == 0) + if (tree_mod_dont_log(fs_info, eb)) return; nritems = btrfs_header_nritems(eb); -- cgit v1.2.3 From f395694c2cd76cb1882fa82dd37e761598367fe9 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 31 May 2012 15:02:32 +0200 Subject: Btrfs: fix tree mod log del_ptr Logging for del_ptr when we're not deleting the last pointer was wrong. This fixes both, duplicate log entries and log sequence. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c7c48489b96..63147c1315a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -540,9 +540,8 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, int ret; int i; - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (tree_mod_dont_log(fs_info, eb)) + return 0; for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, @@ -550,6 +549,10 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; tm->move.dst_slot = dst_slot; @@ -4548,9 +4551,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); - } - - if (tree_mod_log && level) { + } else if (tree_mod_log && level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); -- cgit v1.2.3 From c31931088fd6cf953bd0868a2647b6c3928e6c96 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 31 May 2012 19:24:36 +0200 Subject: Btrfs: fix tree mod log rewinded level and rewinding of moved keys When we rewind REMOVE_WHILE_FREEING operations, there's code that allocates a fresh buffer instead of cloning the old one. Setting that buffer's level correctly was missing in this case. When rewinding a MOVE_KEYS operation, btrfs_node_key_ptr_offset(slot) was missing for memmove_extent_buffer()'s arguments. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 63147c1315a..b4534d918e4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1076,7 +1076,9 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, n--; break; case MOD_LOG_MOVE_KEYS: - memmove_extent_buffer(eb, tm->slot, tm->move.dst_slot, + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; case MOD_LOG_ROOT_REPLACE: @@ -1127,6 +1129,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, btrfs_set_header_backref_rev(eb_rewin, btrfs_header_backref_rev(eb)); btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); } else { eb_rewin = btrfs_clone_extent_buffer(eb); BUG_ON(!eb_rewin); @@ -2609,7 +2612,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: - level = 0; b = get_old_root(root, time_seq); extent_buffer_get(b); level = btrfs_header_level(b); -- cgit v1.2.3 From e41f941a23115e84a8550b3d901a13a14b2edc2f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Mar 2012 09:46:47 -0400 Subject: Btrfs: move over to use ->update_time Btrfs had been doing it's own file_update_time so we could catch ENOSPC properly, so just update our btrfs_update_time to work with the new stuff and then we'll be fancy later. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 - fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 53 ++++++++++++++--------------------------------------- 3 files changed, 15 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd72331d60..ba8743b9a82 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2922,7 +2922,6 @@ int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); int btrfs_dirty_inode(struct inode *inode); -int btrfs_update_time(struct file *file); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53bf2d764bb..974beb84ed6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1404,7 +1404,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = btrfs_update_time(file); + err = file_update_time(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ceb7b9c9edc..3c1723a9ae6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4431,46 +4431,18 @@ int btrfs_dirty_inode(struct inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -int btrfs_update_time(struct file *file) +static int btrfs_update_time(struct inode *inode, struct timespec *now, + int flags) { - struct inode *inode = file->f_path.dentry->d_inode; - struct timespec now; - int ret; - enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; - - /* First try to exhaust all avenues to not sync */ - if (IS_NOCMTIME(inode)) - return 0; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) - sync_it = S_MTIME; - - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it |= S_CTIME; - - if (IS_I_VERSION(inode)) - sync_it |= S_VERSION; - - if (!sync_it) - return 0; - - /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) - return 0; - - /* Only change inode inside the lock region */ - if (sync_it & S_VERSION) + if (flags & S_VERSION) inode_inc_iversion(inode); - if (sync_it & S_CTIME) - inode->i_ctime = now; - if (sync_it & S_MTIME) - inode->i_mtime = now; - ret = btrfs_dirty_inode(inode); - if (!ret) - mark_inode_dirty_sync(inode); - mnt_drop_write(file->f_path.mnt); - return ret; + if (flags & S_CTIME) + inode->i_ctime = *now; + if (flags & S_MTIME) + inode->i_mtime = *now; + if (flags & S_ATIME) + inode->i_atime = *now; + return btrfs_dirty_inode(inode); } /* @@ -6576,7 +6548,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { - ret = btrfs_update_time(vma->vm_file); + ret = file_update_time(vma->vm_file); reserved = 1; } if (ret) { @@ -7647,6 +7619,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7657,6 +7630,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7670,6 +7644,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { -- cgit v1.2.3 From 4d5a0565cebf12c2ef854e4ac1961f13a710a950 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 30 Apr 2012 11:17:25 +0200 Subject: Btrfs: remove call to btrfs_header_nritems with no effect This is a leftover from cleanup patch 559af821. Before the cleanup, btrfs_header_nritems was called inside an if condition. As it has no side effects we need to preserve here, it should simply be dropped. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d7a96cfdc50..836e4e03edc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1650,8 +1650,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - btrfs_header_nritems(mid); - left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); @@ -1681,7 +1679,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - btrfs_header_nritems(mid); } /* -- cgit v1.2.3 From f617e2fd52484fb74236a597d0f9068ec7d9d2dd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 14 Jun 2012 16:10:13 +0200 Subject: Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref When resolving indirect refs, we used to call btrfs_next_leaf in case we didn't find an exact match. While we should find exact matches most of the time, in case we don't, we must continue searching. Treating those matches differently depending on the level we're searching doesn't make sense. Even worse, we might end up searching for a key larger than the largest, in which case there is no next_leaf and subsequent jobs would fail. This commit drops the bogous lines. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3f75895c919..579131de1b3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -294,16 +294,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - + if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } ret = add_all_parents(root, path, parents, level, &key, ref->wanted_disk_byte, extent_item_pos); -- cgit v1.2.3 From 8ba97a15e7d4f70b9af71fa1db86a28dd17ad1b2 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 4 Jun 2012 16:54:57 +0200 Subject: Btrfs: use btrfs_read_lock_root_node in get_old_root get_old_root could race with root node updates because we weren't locking the node early enough. Use btrfs_read_lock_root_node to grab the root locked in the very beginning and release the lock as soon as possible (just like btrfs_search_slot does). Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 836e4e03edc..2cde7b0a010 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1143,6 +1143,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, return eb_rewin; } +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { @@ -1151,6 +1158,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root; u64 old_generation; + eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; @@ -1173,15 +1181,21 @@ get_old_root(struct btrfs_root *root, u64 time_seq) /* there's a root replace operation for the current root */ eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); + } + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + if (!eb) + return NULL; + btrfs_tree_read_lock(eb); + if (old_root->logical != root->node->start) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); } - if (!eb) - return NULL; btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); + extent_buffer_get(eb); return eb; } @@ -2612,9 +2626,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, again: b = get_old_root(root, time_seq); - extent_buffer_get(b); level = btrfs_header_level(b); - btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { -- cgit v1.2.3 From a95236d99fa56766f11056903439f55fe5038bcf Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 5 Jun 2012 16:41:24 +0200 Subject: Btrfs: fix return value for __tree_mod_log_oldest_root In __tree_mod_log_oldest_root() we must return the found operation even if it's not a ROOT_REPLACE operation. Otherwise, the caller assumes that there are no operations to be rewinded and returns immediately. The code in the caller is modified to improve readability. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2cde7b0a010..50d7c99ddce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1023,6 +1023,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, looped = 1; } + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + return found; } @@ -1155,18 +1159,24 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; - struct tree_mod_root *old_root; + struct tree_mod_root *old_root = NULL; u64 old_generation; + u64 logical; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; - old_root = &tm->old_root; - old_generation = tm->generation; + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = root->node->start; + } - tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + tm = tree_mod_log_search(root->fs_info, logical, time_seq); /* * there was an item in the log when __tree_mod_log_oldest_root * returned. this one must not go away, because the time_seq passed to @@ -1174,26 +1184,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq) */ BUG_ON(!tm); - if (old_root->logical == root->node->start) { - /* there are logged operations for the current root */ - eb = btrfs_clone_extent_buffer(root->node); - } else { - /* there's a root replace operation for the current root */ + if (old_root) eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); - } + else + eb = btrfs_clone_extent_buffer(root->node); btrfs_tree_read_unlock(root->node); free_extent_buffer(root->node); if (!eb) return NULL; btrfs_tree_read_lock(eb); - if (old_root->logical != root->node->start) { + if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); } - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); extent_buffer_get(eb); -- cgit v1.2.3 From 3d7806eca43e73a9721d2e09369200ed93036bd0 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 08:29:29 +0200 Subject: Btrfs: add btrfs_next_old_leaf To make sense of the tree mod log, the backref walker not only needs btrfs_search_old_slot, but it also called btrfs_next_leaf, which in turn was calling btrfs_search_slot. This obviously didn't give the correct result. This commit adds btrfs_next_old_leaf, a drop-in replacement for btrfs_next_leaf with a time_seq parameter. If it is zero, it behaves exactly like btrfs_next_leaf. If it is non-zero, it will use btrfs_search_old_slot with this time_seq parameter. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 7 ++++--- fs/btrfs/ctree.c | 11 ++++++++++- fs/btrfs/ctree.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 579131de1b3..8f7d1237b7a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,7 +179,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 wanted_disk_byte, + struct btrfs_key *key, u64 time_seq, + u64 wanted_disk_byte, const u64 *extent_item_pos) { int ret; @@ -212,7 +213,7 @@ add_parent: */ while (1) { eie = NULL; - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_old_leaf(root, path, time_seq); if (ret < 0) return ret; if (ret) @@ -297,7 +298,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - ret = add_all_parents(root, path, parents, level, &key, + ret = add_all_parents(root, path, parents, level, &key, time_seq, ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50d7c99ddce..cb76b2a1b90 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5016,6 +5016,12 @@ next: * returns < 0 on io errors. */ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) { int slot; int level; @@ -5041,7 +5047,10 @@ again: path->keep_locks = 1; path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (time_seq) + ret = btrfs_search_old_slot(root, &key, path, time_seq); + else + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0151ca1ac65..db15e9e7b91 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2753,6 +2753,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { ++p->slots[0]; -- cgit v1.2.3 From 3310c36eef330766fd23d50796287ad764929e24 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 10:52:38 +0200 Subject: Btrfs: fix race in tree mod log addition When adding to the tree modification log, we grab two locks at different stages. We must not drop the outer lock until we're done with section protected by the inner lock. This moves the unlock call for the outer lock to the appropriate position. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cb76b2a1b90..04b06bcf2ca 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -467,6 +467,15 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 0; } +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. + */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { @@ -491,11 +500,11 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, */ kfree(tm); seq = 0; + spin_unlock(&fs_info->tree_mod_seq_lock); } else { __get_tree_mod_seq(fs_info, &tm->elem); seq = tm->elem.seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); return seq; } @@ -521,7 +530,9 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -559,7 +570,9 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -580,7 +593,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static struct tree_mod_elem * -- cgit v1.2.3 From beb42dd793193a3d4e72970bfa73cd8810f63cea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 May 2012 15:35:17 -0400 Subject: Btrfs: pass locked_page into extent_clear_unlock_delalloc if theres an error While doing my enospc work I got a transaction abortion that resulted in a panic when we tried to unlock_page() an already unlocked page. This is because we aren't calling extent_clear_unlock_delalloc with the locked page so it was unlocking all the pages in the range. This is wrong since __extent_writepage expects to have the page locked still unless we return *page_started as 1. This should keep us from panicing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 92df0a5d1d9..ccd6355af73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -830,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode, if (IS_ERR(trans)) { extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | @@ -963,7 +963,7 @@ out: out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | -- cgit v1.2.3 From b939d1ab76b4aa816343cdbd8b44ce9929a3b9ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 11:06:33 -0400 Subject: Btrfs: fix locking in btrfs_destroy_delayed_refs The transaction abort stuff was throwing warnings from the list debugging code because we do a list_del_init outside of the delayed_refs spin lock. The delayed refs locking makes baby Jesus cry so it's not hard to get wrong, but we need to take the ref head mutex to make sure it's not being processed currently, and so if it is we need to drop the spin lock and then take and drop the mutex and do the search again. If we can take the mutex then we can safely remove the head from the list and carry on. Now when the transaction aborts I don't get the list debugging warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b99d5127ba1..c79ddc75608 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3400,7 +3400,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; -again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3408,31 +3407,36 @@ again: return ret; } - node = rb_first(&delayed_refs->root); - while (node) { + while ((node = rb_first(&delayed_refs->root)) != NULL) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + /* Need to wait for the delayed ref to run */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + + continue; + } + kfree(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; } + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); -- cgit v1.2.3 From d7096fc3ef8360f30f228344bc564d4f97d8a158 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:49:57 -0400 Subject: Btrfs: wake up transaction waiters when aborting a transaction I was getting lots of hung tasks and a NULL pointer dereference because we are not cleaning up the transaction properly when it aborts. First we need to reset the running_transaction to NULL so we don't get a bad dereference for any start_transaction callers after this. Also we cannot rely on waitqueue_active() since it's just a list_empty(), so just call wake_up() directly since that will do the barrier for us and such. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 9 +++------ fs/btrfs/transaction.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c79ddc75608..19b4db70dcb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3589,16 +3589,13 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, /* FIXME: cleanup wait for commit */ cur_trans->in_commit = 1; cur_trans->blocked = 1; - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); + wake_up(&root->fs_info->transaction_blocked_wait); cur_trans->blocked = 0; - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_wait); cur_trans->commit_done = 1; - if (waitqueue_active(&cur_trans->commit_wait)) - wake_up(&cur_trans->commit_wait); + wake_up(&cur_trans->commit_wait); btrfs_destroy_pending_snapshots(cur_trans); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1791c6e3d83..59e0203bfb9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1221,6 +1221,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); -- cgit v1.2.3 From 7b8b92af58db347de64a237861fcf13374b34a9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:52:43 -0400 Subject: Btrfs: abort the transaction if the commit fails If a transaction commit fails we don't abort it so we don't set an error on the file system. This patch fixes that by actually calling the abort stuff and then adding a check for a fs error in the transaction start stuff to make sure it is caught properly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 59e0203bfb9..b72b068183e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -100,6 +100,10 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; + } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&root->fs_info->trans_lock); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + return -EROFS; } atomic_set(&cur_trans->num_writers, 1); @@ -1213,12 +1217,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; WARN_ON(trans->use_count > 1); + btrfs_abort_transaction(trans, root, err); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { @@ -1530,7 +1536,7 @@ cleanup_transaction: // WARN_ON(1); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, root); + cleanup_transaction(trans, root, ret); return ret; } -- cgit v1.2.3 From ee670f0af35871edb492db5ba406cef36d1b7c21 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:54:30 -0400 Subject: Btrfs: fix btrfs_destroy_marked_extents So we're forcing the eb's to have their ref count set to 1 so invalidatepage works but this breaks lots of things, for example root nodes, and is just plain wrong, we don't need to just evict all of this stuff. Also drop the invalidatepage altogether and add a page_cache_release(). With this patch we no longer hang when trying to access the root nodes after an aborted transaction and we no longer leak memory. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19b4db70dcb..5a3bf323e2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3524,11 +3524,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, offset >> PAGE_CACHE_SHIFT); spin_unlock(&dirty_pages->buffer_lock); - if (eb) { + if (eb) ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - atomic_set(&eb->refs, 1); - } if (PageWriteback(page)) end_page_writeback(page); @@ -3542,8 +3540,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, spin_unlock_irq(&page->mapping->tree_lock); } - page->mapping->a_ops->invalidatepage(page, 0); unlock_page(page); + page_cache_release(page); } } -- cgit v1.2.3 From 17ca04aff7e6171df684b7b65804df8830eb8c15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:58:55 -0400 Subject: Btrfs: unlock everything properly in the error case for nocow I was getting hung on umount when a transaction was aborted because a range of one of the free space inodes was still locked. This is because the nocow stuff doesn't unlock anything on error. This fixed the problem and I verified that is what was happening. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ccd6355af73..b7f398c36cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1136,8 +1136,18 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - if (!path) + if (!path) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); return -ENOMEM; + } nolock = btrfs_is_free_space_inode(root, inode); @@ -1147,6 +1157,15 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); btrfs_free_path(path); return PTR_ERR(trans); } @@ -1327,8 +1346,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) + if (cur_offset <= end && cow_start == (u64)-1) { cow_start = cur_offset; + cur_offset = end; + } + if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); @@ -1347,6 +1369,17 @@ error: if (!ret) ret = err; + if (ret && cur_offset < end) + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + cur_offset, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 606686eeac4550d2212bf3d621a810407ef5e9bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 4 Jun 2012 14:03:51 -0400 Subject: Btrfs: use rcu to protect device->name Al pointed out that we can just toss out the old name on a device and add a new one arbitrarily, so anybody who uses device->name in printk could possibly use free'd memory. Instead of adding locking around all of this he suggested doing it with RCU, so I've introduced a struct rcu_string that does just that and have gone through and protected all accesses to device->name that aren't under the uuid_mutex with rcu_read_lock(). This protects us and I will use it for dealing with removing the device that we used to mount the file system in a later patch. Thanks, Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/check-integrity.c | 16 ++++---- fs/btrfs/disk-io.c | 10 +++-- fs/btrfs/extent_io.c | 7 ++-- fs/btrfs/ioctl.c | 13 +++++-- fs/btrfs/rcu-string.h | 56 ++++++++++++++++++++++++++++ fs/btrfs/scrub.c | 30 +++++++++------ fs/btrfs/volumes.c | 92 +++++++++++++++++++++++++++++----------------- fs/btrfs/volumes.h | 2 +- 8 files changed, 162 insertions(+), 64 deletions(-) create mode 100644 fs/btrfs/rcu-string.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9cebb1fd6a3..da6e9364a5e 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "print-tree.h" #include "locking.h" #include "check-integrity.h" +#include "rcu-string.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -843,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(superblock_tmp, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5a3bf323e2b..1c9664b16cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "check-integrity.h" +#include "rcu-string.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2575,8 +2576,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", device->name); + printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ @@ -2749,8 +2751,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); + printk_in_rcu("btrfs: disabling barriers on dev %s\n", + rcu_str_deref(device->name)); device->nobarriers = 1; } if (!bio_flagged(bio, BIO_UPTODATE)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2c8f7b20461..aaa12c1eb34 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "check-integrity.h" #include "locking.h" +#include "rcu-string.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1917,9 +1918,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } - printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " - "sector %llu)\n", page->mapping->host->i_ino, start, - dev->name, sector); + printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24b776c08d9..c5254dde1f0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@ #include "locking.h" #include "inode-map.h" #include "backref.h" +#include "rcu-string.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -1345,8 +1346,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "btrfs: new size for %s is %llu\n", - device->name, (unsigned long long)new_size); + printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + rcu_str_deref(device->name), + (unsigned long long)new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -2264,7 +2266,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + strncpy(di_args->path, name->str, sizeof(di_args->path)); + rcu_read_unlock(); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 00000000000..9e111e4576d --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { + struct rcu_head rcu; + char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + strncpy(ret->str, src, len); + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a38cfa4f251..b223620cd5a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -26,6 +26,7 @@ #include "backref.h" #include "extent_io.h" #include "check-integrity.h" +#include "rcu-string.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -320,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -332,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) return 0; err: - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); free_ipath(ipath); @@ -390,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, dev->name, + "%llu\n", errstr, swarn.logical, + rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -580,9 +582,11 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, sdev->dev->name); + (unsigned long long)fixup->logical, + rcu_str_deref(sdev->dev->name)); } btrfs_free_path(path); @@ -936,18 +940,20 @@ corrected_error: spin_lock(&sdev->stat_lock); sdev->stat.corrected_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } } else { did_not_correct_error: spin_lock(&sdev->stat_lock); sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7782020996f..8a3d2594b80 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" +#include "rcu-string.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -64,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } kfree(fs_devices); @@ -334,8 +335,8 @@ static noinline int device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; + struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); - char *name; fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { @@ -369,11 +370,13 @@ static noinline int device_list_add(const char *path, memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); spin_lock_init(&device->io_lock); - device->name = kstrdup(path, GFP_NOFS); - if (!device->name) { + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { kfree(device); return -ENOMEM; } + rcu_assign_pointer(device->name, name); INIT_LIST_HEAD(&device->dev_alloc_list); /* init readahead state */ @@ -390,12 +393,12 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (!device->name || strcmp(device->name, path)) { - name = kstrdup(path, GFP_NOFS); + } else if (!device->name || strcmp(device->name->str, path)) { + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; - kfree(device->name); - device->name = name; + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); if (device->missing) { fs_devices->missing_devices--; device->missing = 0; @@ -430,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { + struct rcu_string *name; + device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) goto error; - device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) { + /* + * This is ok to do without rcu read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { kfree(device); goto error; } + rcu_assign_pointer(device->name, name); device->devid = orig_dev->devid; device->work.func = pending_bios_fn; @@ -491,7 +501,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -516,7 +526,7 @@ static void __free_device(struct work_struct *work) if (device->bdev) blkdev_put(device->bdev, device->mode); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -540,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; + struct rcu_string *name; if (device->bdev) fs_devices->open_devices--; @@ -555,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); /* -ENOMEM */ memcpy(new_device, device, sizeof(*new_device)); - new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(device->name && !name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; @@ -621,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name->str, flags, holder); if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name); + printk(KERN_INFO "open %s failed\n", device->name->str); goto error; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -1632,6 +1646,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct block_device *bdev; struct list_head *devices; struct super_block *sb = root->fs_info->sb; + struct rcu_string *name; u64 total_bytes; int seeding_dev = 0; int ret = 0; @@ -1671,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - device->name = kstrdup(device_path, GFP_NOFS); - if (!device->name) { + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { kfree(device); ret = -ENOMEM; goto error; } + rcu_assign_pointer(device->name, name); ret = find_next_devid(root, &device->devid); if (ret) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); ret = PTR_ERR(trans); goto error; @@ -1796,7 +1812,7 @@ error_trans: unlock_chunks(root); btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); - kfree(device->name); + rcu_string_free(device->name); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -4204,10 +4220,17 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { +#ifdef DEBUG + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - dev->name, dev->devid, bio->bi_size); + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); +#endif bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -4694,8 +4717,9 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", - device->name, (unsigned long long)device->devid); + printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + rcu_str_deref(device->name), + (unsigned long long)device->devid); __btrfs_reset_dev_stats(device); device->dev_stats_valid = 1; btrfs_release_path(path); @@ -4747,8 +4771,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", - ret, device->name); + printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, rcu_str_deref(device->name)); goto out; } @@ -4757,8 +4781,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } ret = 1; @@ -4770,8 +4794,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } } @@ -4823,9 +4847,9 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -4837,8 +4861,8 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { - printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3406a88ca83..74366f27a76 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; - char *name; + struct rcu_string *name; /* the internal btrfs device id */ u64 devid; -- cgit v1.2.3 From 9c5085c147989d48dfe74194b48affc23f376650 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Jun 2012 14:13:12 -0400 Subject: Btrfs: implement ->show_devname Because btrfs can remove the device that was mounted we need to have a ->show_devname so that in this case we can print out some other device in the file system to /proc/mount. So if there are multiple devices in a btrfs file system we will just print the device with the lowest devid that we can find. This will make everything consistent and deal with device removal properly. The drawback is if you mount with a device that is higher than the lowest devicd it won't show up as the mounted device in /proc/mounts, but this is a small price to pay. This was inspired by Miao Xie's patch. Thanks, Reviewed-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 96eb9fef7bd..0eb9a4da069 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "version.h" #include "export.h" #include "compression.h" +#include "rcu-string.h" #define CREATE_TRACE_POINTS #include @@ -1482,12 +1483,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags) "error %d\n", btrfs_ino(inode), ret); } +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; + struct list_head *head; + struct rcu_string *name; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + cur_devices = fs_info->fs_devices; + while (cur_devices) { + head = &cur_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; + } + cur_devices = cur_devices->seed; + } + + if (first_dev) { + rcu_read_lock(); + name = rcu_dereference(first_dev->name); + seq_escape(m, name->str, " \t\n\\"); + rcu_read_unlock(); + } else { + WARN_ON(1); + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, -- cgit v1.2.3 From 8180ef8894fa402443205cff1e23417e8d3434df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:16:12 -0400 Subject: Btrfs: keep inode pinned when compressing writes A user reported lots of problems using compression on the new code and it turns out part of the problem was that igrab() was failing when we added a new ordered extent. This is because when writing out an inode under compression we immediately return without actually doing anything to the pages, and then in another thread at some point down the line actually do the ordered dance. The problem is between the point that we start writeback and we actually add the ordered extent we could be trying to reclaim the inode, which makes igrab() return NULL. So we need to do an igrab() when we create the async extent and then drop it when we are done with it. This makes sure we stay pinned in memory until the ordered extent can get a reference on it and we are good to go. With this patch we no longer panic in btrfs_finish_ordered_io(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b7f398c36cb..06075043da5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -986,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work) compress_file_range(async_cow->inode, async_cow->locked_page, async_cow->start, async_cow->end, async_cow, &num_added); - if (num_added == 0) + if (num_added == 0) { + iput(async_cow->inode); async_cow->inode = NULL; + } } /* @@ -1020,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); + if (async_cow->inode) + iput(async_cow->inode); kfree(async_cow); } @@ -1038,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = inode; + async_cow->inode = igrab(inode); async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; -- cgit v1.2.3 From 7ddf5a42d311d74fd9f7373cb56def0843c219f8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:26:47 -0400 Subject: Btrfs: call filemap_fdatawrite twice for compression I removed this in an earlier commit and I was wrong. Because compression can return from filemap_fdatawrite() without having actually set any of it's pages as writeback() it can make filemap_fdatawait() do essentially nothing, and then we won't find any ordered extents because they may not have been created yet. So not only does this make fsync() completely useless, but it will also screw up if you truncate on a non-page aligned offset since we zero out the end and then wait on ordered extents and then call drop caches. We can drop the cache before the io completes and then we try to unpin the extent we just wrote we won't find it and everything goes sideways. So fix this by putting it back and put a giant comment there to keep me from trying to remove it in the future. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/inode.c | 15 +++++++++------ fs/btrfs/ordered-data.c | 22 +++++++++++++++++++++- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e616f8872e6..12394a90d60 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -37,6 +37,7 @@ #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06075043da5..7a090fb4eb9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1398,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) + } else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); - else + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + } return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9e138cdc36c..643335a4fe3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -627,7 +627,27 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - filemap_write_and_wait_range(inode->i_mapping, start, orig_end); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; -- cgit v1.2.3 From 6c282eb40ed6f64a51ff447707714df551d85b8e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 16:03:35 +0800 Subject: Btrfs: fix defrag regression If a file has 3 small extents: | ext1 | ext2 | ext3 | Running "btrfs fi defrag" will only defrag the last two extents, if those extent mappings hasn't been read into memory from disk. This bug was introduced by commit 17ce6ef8d731af5edac8c39e806db4c7e1f6956f ("Btrfs: add a check to decide if we should defrag the range") The cause is, that commit looked into previous and next extents using lookup_extent_mapping() only. While at it, remove the code that checks the previous extent, since it's sufficient to check the next extent. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 97 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c5254dde1f0..a98f7d25282 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -786,39 +786,57 @@ none: return -ENOENT; } -/* - * Validaty check of prev em and next em: - * 1) no prev/next em - * 2) prev/next em is an hole/inline extent - */ -static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *prev = NULL, *next = NULL; - int ret = 0; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 len = PAGE_CACHE_SIZE; + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ read_lock(&em_tree->lock); - prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); - next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); - if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && - (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) - ret = 1; - free_extent_map(prev); - free_extent_map(next); + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1); + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ + struct extent_map *next; + bool ret = true; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + next = defrag_lookup_extent(inode, em->start + em->len); + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + + free_extent_map(next); return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u64 len, - int thresh, u64 *last_len, u64 *skip, - u64 *defrag_end) +static int should_defrag_range(struct inode *inode, u64 start, int thresh, + u64 *last_len, u64 *skip, u64 *defrag_end) { - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; int ret = 1; + bool next_mergeable = true; /* * make sure that once we start defragging an extent, we keep on @@ -829,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, *skip = 0; - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); - - if (!em) { - /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); - - if (IS_ERR(em)) - return 0; - } + em = defrag_lookup_extent(inode, start); + if (!em) + return 0; /* this will cover holes, and inline extents */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { @@ -853,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, goto out; } - /* If we have nothing to merge with us, just skip. */ - if (check_adjacent_extents(inode, em)) { - ret = 0; - goto out; - } + next_mergeable = defrag_check_next_extent(inode, em); /* - * we hit a real extent, if it is big don't bother defragging it again + * we hit a real extent, if it is big or the next extent is not a + * real extent, don't bother defragging it */ - if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + if ((*last_len == 0 || *last_len >= thresh) && + (em->len >= thresh || !next_mergeable)) ret = 0; - out: /* * last_len ends up being a counter of how many bytes we've defragged. @@ -1143,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, extent_thresh, - &last_len, &skip, &defrag_end)) { + extent_thresh, &last_len, &skip, + &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip -- cgit v1.2.3 From 69e380d176e73653e09ce2cf3a5dc09682a69229 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 17:10:51 +0800 Subject: Btrfs: fix incompat flags setting It's a bug, but it happens to work, as BTRFS_COMPRESS_LZO == 2, which has only one bit set. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c9664b16cd..9a569aef72e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2119,7 +2119,7 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; /* -- cgit v1.2.3 From bc1782374b128103ae9689e0753e0610f35b6bfd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:18 -0600 Subject: Btrfs: fix missing inherited flag in rename When we move a file into a directory with compression flag, we need to inherite BTRFS_INODE_COMPRESS and clear BTRFS_INODE_NOCOMPRESS as well. But if we move a file into a directory without compression flag, we need to clear both of them. It is the way how our setflags deals with compression flag, so keep the same behaviour here. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7a090fb4eb9..3f2c8cbe5ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7122,10 +7122,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode) else b_inode->flags &= ~BTRFS_INODE_NODATACOW; - if (b_dir->flags & BTRFS_INODE_COMPRESS) + if (b_dir->flags & BTRFS_INODE_COMPRESS) { b_inode->flags |= BTRFS_INODE_COMPRESS; - else - b_inode->flags &= ~BTRFS_INODE_COMPRESS; + b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + b_inode->flags &= ~(BTRFS_INODE_COMPRESS | + BTRFS_INODE_NOCOMPRESS); + } } static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- cgit v1.2.3 From 4e42ae1bdcda77fc958a17d7ff4ba5a9c9c207da Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:19 -0600 Subject: Btrfs: do not resize a seeding device Seeding devices are not supposed to change any more. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a98f7d25282..58adbd0356d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1306,6 +1306,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = -EINVAL; goto out_free; } + if (device->fs_devices && device->fs_devices->seeding) { + printk(KERN_INFO "btrfs: resizer unable to apply on " + "seeding device %llu\n", devid); + ret = -EINVAL; + goto out_free; + } + if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; else { -- cgit v1.2.3 From 6e841e32b159e298debbb2cb0e9158e894fcaf05 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:20 -0600 Subject: Btrfs: avoid memory leak of extent state in error handling routine We've forgotten to clear extent states in pinned tree, which will results in space counter mismatch and memory leak: WARNING: at fs/btrfs/extent-tree.c:7537 btrfs_free_block_groups+0x1f3/0x2e0 [btrfs]() ... space_info 2 has 8380416 free, is not full space_info total=12582912, used=4096, pinned=4096, reserved=0, may_use=0, readonly=4194304 btrfs state leak: start 29364224 end 29376511 state 1 in tree ffff880075f20090 refs 1 ... Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a569aef72e..63a36232788 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3601,6 +3601,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); /* memset(cur_trans, 0, sizeof(*cur_trans)); -- cgit v1.2.3 From ed0eaa14981e87a1e185b61e4ef621c440e3930c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:21 -0600 Subject: Btrfs: make sure that we've made everything in pinned tree clean Since we have two trees for recording pinned extents, we need to go through both of them to make sure that we've done everything clean. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 63a36232788..ffdd76bf05d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3557,8 +3557,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, u64 start; u64 end; int ret; + bool loop = true; unpin = pinned_extents; +again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3576,6 +3578,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, cond_resched(); } + if (loop) { + if (unpin == &root->fs_info->freed_extents[0]) + unpin = &root->fs_info->freed_extents[1]; + else + unpin = &root->fs_info->freed_extents[0]; + loop = false; + goto again; + } + return 0; } -- cgit v1.2.3 From 67cde3448d951b55088a6ea3bb1aee0160068fb9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 14 Jun 2012 02:23:22 -0600 Subject: Btrfs: destroy the items of the delayed inodes in error handling routine the items of the delayed inodes were forgotten to be freed, this patch fixes it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 18 ++++++++++++++++++ fs/btrfs/delayed-inode.h | 3 +++ fs/btrfs/disk-io.c | 6 ++++++ 3 files changed, 27 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c18d0442ae6..2399f408691 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } } } + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7083d08b2a2..f5aa4023d3e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); + /* Used for readdir() */ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, struct list_head *del_list); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ffdd76bf05d..e22c5bbf022 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3608,6 +3608,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->commit_done = 1; wake_up(&cur_trans->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(cur_trans); btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, @@ -3662,6 +3665,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); -- cgit v1.2.3 From 4325edd0786fdd3909f9550e52a963b2fe54f78b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:02:02 -0400 Subject: Btrfs: init old_generation in get_old_root gcc was giving an uninit variable warning here. Strictly speaking we don't need to init it, but this will make things much less error prone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 04b06bcf2ca..15cbc2bf4ff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1175,7 +1175,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb; struct tree_mod_root *old_root = NULL; - u64 old_generation; + u64 old_generation = 0; u64 logical; eb = btrfs_read_lock_root_node(root); -- cgit v1.2.3 From a8c4a33b98b097e69cd1a10672c43d17ad0ffb25 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:07:17 -0400 Subject: Btrfs: cast devid to unsigned long long for printk %llu Avoid warning in 32 bit machines Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 58adbd0356d..0e92e576300 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1308,7 +1308,8 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } if (device->fs_devices && device->fs_devices->seeding) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", devid); + "seeding device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_free; } -- cgit v1.2.3 From 1c8f52a5e9539600543347bcdefafa1854e07986 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:25 -0600 Subject: Btrfs: introduce btrfs_next_old_item We introduce btrfs_next_old_item that uses btrfs_next_old_leaf instead of btrfs_next_leaf. btrfs_next_item is also changed to simply call btrfs_next_old_item with time_seq being 0. Signed-off-by: Alexander Block Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index db15e9e7b91..84ac723f58f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2755,13 +2755,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +static inline int btrfs_next_old_item(struct btrfs_root *root, + struct btrfs_path *p, u64 time_seq) { ++p->slots[0]; if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); + return btrfs_next_old_leaf(root, p, time_seq); return 0; } +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, -- cgit v1.2.3 From 69bca40d41c613927b150c5392505f1894fe3010 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:26 -0600 Subject: Btrfs: don't assume to be on the correct extent in add_all_parents add_all_parents did assume that path is already at a correct extent data item, which may not be true in case of data extents that were partly rewritten and splitted. We need to check if we're on a matching extent for every item and only for the ones after the first. The loop is changed to do this now. This patch also fixes a bug introduced with commit 3b127fd8 "Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref". The removal of next_leaf did sometimes result in slot==nritems when the above described case happens, and thus resulting in invalid values (e.g. wanted_obejctid) in add_all_parents (leading to missed backrefs or even crashes). Signed-off-by: Alexander Block Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 94 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 42 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8f7d1237b7a..7301cdb4b2c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,61 +179,74 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 time_seq, + struct btrfs_key *key_for_search, u64 time_seq, u64 wanted_disk_byte, const u64 *extent_item_pos) { - int ret; - int slot = path->slots[level]; - struct extent_buffer *eb = path->nodes[level]; + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL; u64 disk_byte; - u64 wanted_objectid = key->objectid; -add_parent: - if (level == 0 && extent_item_pos) { - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); if (ret < 0) return ret; - } - ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) return 0; + } /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. */ - while (1) { - eie = NULL; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - if (ret < 0) - return ret; - if (ret) - return 0; + while (!ret) { eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, key, slot); - if (key->objectid != wanted_objectid || - key->type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (!ret) { + ret = ulist_add(parents, eb->start, + (unsigned long)eie, GFP_NOFS); + if (ret < 0) + break; + if (!extent_item_pos) { + ret = btrfs_next_old_leaf(root, path, + time_seq); + continue; + } + } } + ret = btrfs_next_old_item(root, path, time_seq); } - return 0; + if (ret > 0) + ret = 0; + return ret; } /* @@ -250,7 +263,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; - struct btrfs_key key = {0}; struct extent_buffer *eb; int ret = 0; int root_level; @@ -295,11 +307,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - - ret = add_all_parents(root, path, parents, level, &key, time_seq, - ref->wanted_disk_byte, extent_item_pos); + ret = add_all_parents(root, path, parents, level, &ref->key_for_search, + time_seq, ref->wanted_disk_byte, + extent_item_pos); out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From e18fca734278784bd6591de63ca148cc27344ca9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jun 2012 07:23:18 -0600 Subject: Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Reported-by: Dan Carpenter Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e22c5bbf022..7d7bc8eace8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3426,6 +3426,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, mutex_unlock(&head->mutex); btrfs_put_delayed_ref(ref); + spin_lock(&delayed_refs->lock); continue; } -- cgit v1.2.3 From cb77fcd88569cd2b7b25ecd4086ea932a53be9b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jun 2012 12:19:48 -0600 Subject: Btrfs: delay iput with async extents There is some concern that these iput()'s could be the final iputs and could induce lockups on people waiting on writeback. This would happen in the rare case that we don't create ordered extents because of an error, but it is theoretically possible and we already have a mechanism to deal with this so just make them delayed iputs to negate any worry. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f2c8cbe5ba..4a4f2d59a64 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -987,7 +987,7 @@ static noinline void async_cow_start(struct btrfs_work *work) async_cow->start, async_cow->end, async_cow, &num_added); if (num_added == 0) { - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); async_cow->inode = NULL; } } @@ -1023,7 +1023,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); if (async_cow->inode) - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); kfree(async_cow); } -- cgit v1.2.3 From 8ca78f3eda4bf1799e8c4ba02035623fd7a347df Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 27 Jun 2012 15:05:48 +0200 Subject: Btrfs: avoid waiting for delayed refs when we must not We track two conditions to decide if we should sleep while waiting for more delayed refs, the number of delayed refs (num_refs) and the first entry in the list of blockers (first_seq). When we suspect staleness, we save num_refs and do one more cycle. If nothing changes, we then save first_seq for later comparison and do wait_event. We ought to save first_seq the very same moment we're saving num_refs. Otherwise we cannot be sure that nothing has changed and we might start waiting when we shouldn't, which could lead to starvation. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4b5a1e1bdef..6e1d36702ff 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2347,12 +2347,10 @@ next: return count; } - static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) + unsigned long num_refs, + struct list_head *first_seq) { - struct list_head *first_seq = delayed_refs->seq_head.next; - spin_unlock(&delayed_refs->lock); pr_debug("waiting for more refs (num %ld, first %p)\n", num_refs, first_seq); @@ -2381,6 +2379,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct list_head cluster; + struct list_head *first_seq = NULL; int ret; u64 delayed_start; int run_all = count == (unsigned long)-1; @@ -2436,8 +2435,10 @@ again: */ consider_waiting = 1; num_refs = delayed_refs->num_entries; + first_seq = root->fs_info->tree_mod_seq_list.next; } else { - wait_for_more_refs(delayed_refs, num_refs); + wait_for_more_refs(delayed_refs, + num_refs, first_seq); /* * after waiting, things have changed. we * dropped the lock and someone else might have -- cgit v1.2.3 From 9345457f4a539a40056431aeb6f068750857472f Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 27 Jun 2012 15:23:09 +0200 Subject: Btrfs: support root level changes in __resolve_indirect_ref With the tree mod log, we can have a tree that's two levels high, but btrfs_search_old_slot may still return a path with the tree root at level one instead. __resolve_indirect_ref must care for this and accept parents in a lower level than expected. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7301cdb4b2c..cf0df904347 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -301,10 +301,14 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; eb = path->nodes[level]; - if (!eb) { - WARN_ON(1); - ret = 1; - goto out; + while (!eb) { + if (!level) { + WARN_ON(1); + ret = 1; + goto out; + } + level--; + eb = path->nodes[level]; } ret = add_all_parents(root, path, parents, level, &ref->key_for_search, -- cgit v1.2.3 From 28da9fb4467f7a650cd31af6dfad3a4e4a3abf6e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Jun 2012 10:59:13 +0200 Subject: Btrfs: fix tree mod log for root replacements at leaf level For the tree mod log, we don't log any operations at leaf level. If the root is at the leaf level (i.e. the tree consists only of the root), then __tree_mod_log_oldest_root will find a ROOT_REPLACE operation in the log (because we always log that one no matter which level), but no other operations. With this patch __tree_mod_log_oldest_root exits cleanly instead of BUGging in this situation. get_old_root checks if its really a root at leaf level in case we don't have any operations and WARNs if this assumption breaks. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 15cbc2bf4ff..7d1e4fc5fb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1024,11 +1024,18 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, if (!looped && !tm) return 0; /* - * we must have key remove operations in the log before the - * replace operation. + * if there are no tree operation for the oldest root, we simply + * return it. this should only happen if that (old) root is at + * level 0. */ - BUG_ON(!tm); + if (!tm) + break; + /* + * if there's an operation that's not a root replacement, we + * found the oldest version of our root. normally, we'll find a + * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ if (tm->op != MOD_LOG_ROOT_REPLACE) break; @@ -1192,16 +1199,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } tm = tree_mod_log_search(root->fs_info, logical, time_seq); - /* - * there was an item in the log when __tree_mod_log_oldest_root - * returned. this one must not go away, because the time_seq passed to - * us must be blocking its removal. - */ - BUG_ON(!tm); - if (old_root) - eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, - root->nodesize); + eb = alloc_dummy_extent_buffer(logical, root->nodesize); else eb = btrfs_clone_extent_buffer(root->node); btrfs_tree_read_unlock(root->node); @@ -1216,7 +1215,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } - __tree_mod_log_rewind(eb, time_seq, tm); + if (tm) + __tree_mod_log_rewind(eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); extent_buffer_get(eb); return eb; -- cgit v1.2.3 From c3e0696523862c48b4d8c73ffb2867e9db478338 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Jun 2012 11:01:06 +0200 Subject: Btrfs: always put insert_ptr modifications into the tree mod log Several callers of insert_ptr set the tree_mod_log parameter to 0 to avoid addition to the tree mod log. In fact, we need all of those operations. This commit simply removes the additional parameter and makes addition to the tree mod log unconditional. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7d1e4fc5fb6..e005d9b0461 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2997,7 +2997,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, static void insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, - int slot, int level, int tree_mod_log) + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -3010,7 +3010,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { - if (tree_mod_log && level) + if (level) tree_mod_log_eb_move(root->fs_info, lower, slot + 1, slot, nritems - slot); memmove_extent_buffer(lower, @@ -3018,7 +3018,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } - if (tree_mod_log && level) { + if (level) { ret = tree_mod_log_insert_key(root->fs_info, lower, slot, MOD_LOG_KEY_ADD); BUG_ON(ret < 0); @@ -3106,7 +3106,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(split); insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1, 1); + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -3643,7 +3643,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1, 0); + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3850,7 +3850,7 @@ again: if (mid <= slot) { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1, 0); + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3859,7 +3859,7 @@ again: } else { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1, 0); + path->slots[1], 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; -- cgit v1.2.3 From 155725c9c051a343be5e555bf943da827e6cf721 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 22 Jun 2012 14:01:00 +0200 Subject: Btrfs: leave critical region in btrfs_find_all_roots as soon as possible When delayed refs exist, btrfs_find_all_roots used to hold the delayed ref mutex way longer than actually required. We ought to drop it immediately after we're done collecting all the delayed refs. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index cf0df904347..a383c18e74e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -839,6 +839,7 @@ again: } ret = __add_delayed_refs(head, delayed_ref_seq, &prefs_delayed); + mutex_unlock(&head->mutex); if (ret) { spin_unlock(&delayed_refs->lock); goto out; @@ -932,8 +933,6 @@ again: } out: - if (head) - mutex_unlock(&head->mutex); btrfs_free_path(path); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); -- cgit v1.2.3 From 19956c7e94a7a58d6df8c4db5ae62f9109a7c663 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 22 Jun 2012 14:52:13 +0200 Subject: Btrfs: fix tree mod log rewind of ADD operations When a MOD_LOG_KEY_ADD operation is rewinded, we remove the key from the tree block. If its not the last key, removal involves a move operation. This move operation was explicitly done before this commit. However, at insertion time, there's a move operation before the actual addition to make room for the new key, which is recorded in the tree mod log as well. This means, we must drop the move operation when rewinding the add operation, because the next operation we'll be rewinding will be the corresponding MOD_LOG_MOVE_KEYS operation. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e005d9b0461..b98f8604f4f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1094,11 +1094,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, tm->generation); break; case MOD_LOG_KEY_ADD: - if (tm->slot != n - 1) { - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->slot + 1); - memmove_extent_buffer(eb, o_dst, o_src, p_size); - } + /* if a move operation is needed it's in the log */ n--; break; case MOD_LOG_MOVE_KEYS: -- cgit v1.2.3 From d42244a0d36ad0939c5f173ebf15841a0678899c Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 22 Jun 2012 14:51:15 +0200 Subject: Btrfs: resolve tree mod log locking issue in btrfs_next_leaf With the tree mod log, we may end up with two roots (the current root and a rewinded version of it) both pointing to two leaves, l1 and l2, of which l2 had already been cow-ed in the current transaction. If we don't rewind any tree blocks, we cannot have two roots both pointing to an already cowed tree block. Now there is btrfs_next_leaf, which has a leaf locked and wants a lock on the next (right) leaf. And there is push_leaf_left, which has a (cowed!) leaf locked and wants a lock on the previous (left) leaf. In order to solve this dead lock situation, we use try_lock in btrfs_next_leaf (only in case it's called with a tree mod log time_seq paramter) and if we fail to get a lock on the next leaf, we give up our lock on the current leaf and retry from the very beginning. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b98f8604f4f..8206b390058 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5119,6 +5119,18 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && time_seq) { + /* + * If we don't get the lock, we may be racing + * with push_leaf_left, holding that lock while + * itself waiting for the leaf we've currently + * locked. To solve this situation, we give up + * on our lock and cycle. + */ + btrfs_release_path(path); + cond_resched(); + goto again; + } if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); -- cgit v1.2.3 From 597a60fadedf9a40fdff8735054bf772b3dafd57 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 14 Jun 2012 16:42:31 +0200 Subject: Btrfs: don't count I/O statistic read errors for missing devices It is normal behaviour of the low level btrfs function btrfs_map_bio() to complete a bio with -EIO if the device is missing, instead of just preventing the bio creation in an earlier step. This used to cause I/O statistic read error increments and annoying printk_ratelimited messages. This commit fixes the issue. Signed-off-by: Stefan Behrens Reported-by: Carey Underwood --- fs/btrfs/volumes.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8a3d2594b80..3f292cf693a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4061,16 +4061,18 @@ static void btrfs_end_bio(struct bio *bio, int err) BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; - if (bio->bi_rw & WRITE) - btrfs_dev_stat_inc(dev, - BTRFS_DEV_STAT_WRITE_ERRS); - else - btrfs_dev_stat_inc(dev, - BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) - btrfs_dev_stat_inc(dev, - BTRFS_DEV_STAT_FLUSH_ERRS); - btrfs_dev_stat_print_on_error(dev); + if (dev->bdev) { + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } } } -- cgit v1.2.3 From c3473e830074ef04f974f2829690942dd8580619 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Jun 2012 10:59:00 -0400 Subject: Btrfs: fix dio write vs buffered read race Miao pointed out there's a problem with mixing dio writes and buffered reads. If the read happens between us invalidating the page range and actually locking the extent we can bring in pages into page cache. Then once the write finishes if somebody tries to read again it will just find uptodate pages and we'll read stale data. So we need to lock the extent and check for uptodate bits in the range. If there are uptodate bits we need to unlock and invalidate again. This will keep this race from happening since we will hold the extent locked until we create the ordered extent, and then teh read side always waits for ordered extents. There was also a race in how we updated i_size, previously we were relying on the generic DIO stuff to adjust the i_size after the DIO had completed, but this happens outside of the extent lock which means reads could come in and not see the updated i_size. So instead move this work into where we create the extents, and then this way the update ordered i_size stuff works properly in the endio handlers. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 13 ------------- fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 876cddd6b2f..248d2026524 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1334,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t *ppos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; - struct inode *inode = fdentry(file)->d_inode; struct iov_iter i; ssize_t written; ssize_t written_buffered; @@ -1344,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, count, ocount); - /* - * the generic O_DIRECT will update in-memory i_size after the - * DIOs are done. But our endio handlers that update the on - * disk i_size never update past the in memory i_size. So we - * need one more update here to catch any additions to the - * file - */ - if (inode->i_size != BTRFS_I(inode)->disk_i_size) { - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - mark_inode_dirty(inode); - } - if (written < 0 || written == count) return written; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a4f2d59a64..6971fb5fc85 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5904,8 +5904,17 @@ map: bh_result->b_size = len; bh_result->b_bdev = em->bdev; set_buffer_mapped(bh_result); - if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); + if (create) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (start + len > i_size_read(inode)) + i_size_write(inode, start + len); + } free_extent_map(em); @@ -6388,12 +6397,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, */ ordered = btrfs_lookup_ordered_range(inode, lockstart, lockend - lockstart + 1); - if (!ordered) + + /* + * We need to make sure there are no buffered pages in this + * range either, we could have raced between the invalidate in + * generic_file_direct_write and locking the extent. The + * invalidate needs to happen so that reads after a write do not + * get stale data. + */ + if (!ordered && (!writing || + !test_range_bit(&BTRFS_I(inode)->io_tree, + lockstart, lockend, EXTENT_UPTODATE, 0, + cached_state))) break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); + + if (ordered) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } else { + /* Screw you mmap */ + ret = filemap_write_and_wait_range(file->f_mapping, + lockstart, + lockend); + if (ret) + goto out; + + /* + * If we found a page that couldn't be invalidated just + * fall back to buffered. + */ + ret = invalidate_inode_pages2_range(file->f_mapping, + lockstart >> PAGE_CACHE_SHIFT, + lockend >> PAGE_CACHE_SHIFT); + if (ret) { + if (ret == -EBUSY) + ret = 0; + goto out; + } + } + cond_resched(); } -- cgit v1.2.3 From 68310a5e42f93c2242ec1836c3b18d531e0065e2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 Jun 2012 12:24:12 -0600 Subject: Btrfs: restore restriper state on all mounts Fix a bug that triggered asserts in btrfs_balance() in both normal and resume modes -- restriper state was not properly restored on read-only mounts. This factors out resuming code from btrfs_restore_balance(), which is now also called earlier in the mount sequence to avoid the problem of some early writes getting the old profile. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 10 ++++++---- fs/btrfs/volumes.c | 39 +++++++++++++++++++-------------------- fs/btrfs/volumes.h | 2 +- 3 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d7bc8eace8..3a7961ba161 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2354,12 +2354,17 @@ retry_root_backup: BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) goto recovery_tree_root; - csum_root->track_dirty = 1; fs_info->generation = generation; fs_info->last_trans_committed = generation; + ret = btrfs_recover_balance(fs_info); + if (ret) { + printk(KERN_WARNING "btrfs: failed to recover balance\n"); + goto fail_block_groups; + } + ret = btrfs_init_dev_stats(fs_info); if (ret) { printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", @@ -2492,9 +2497,6 @@ retry_root_backup: err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); - if (!err) - err = btrfs_recover_balance(fs_info->tree_root); - if (err) { close_ctree(tree_root); return err; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f292cf693a..48943d0f861 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2867,9 +2867,8 @@ static int balance_kthread(void *data) return ret; } -int btrfs_recover_balance(struct btrfs_root *tree_root) +int btrfs_recover_balance(struct btrfs_fs_info *fs_info) { - struct task_struct *tsk; struct btrfs_balance_control *bctl; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; @@ -2882,29 +2881,30 @@ int btrfs_recover_balance(struct btrfs_root *tree_root) if (!path) return -ENOMEM; - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } - key.objectid = BTRFS_BALANCE_OBJECTID; key.type = BTRFS_BALANCE_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out_bctl; + goto out; if (ret > 0) { /* ret = -ENOENT; */ ret = 0; - goto out_bctl; + goto out; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - bctl->fs_info = tree_root->fs_info; - bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + bctl->fs_info = fs_info; + bctl->flags = btrfs_balance_flags(leaf, item); + bctl->flags |= BTRFS_BALANCE_RESUME; btrfs_balance_data(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); @@ -2913,14 +2913,13 @@ int btrfs_recover_balance(struct btrfs_root *tree_root) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); - if (IS_ERR(tsk)) - ret = PTR_ERR(tsk); - else - goto out; + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); -out_bctl: - kfree(bctl); + set_balance_control(bctl); + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 74366f27a76..e1b1a649fc5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,7 +281,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); -int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -- cgit v1.2.3 From 2b6ba629b5aac51e7099efbb43e2b403213aa7fb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 Jun 2012 12:24:13 -0600 Subject: Btrfs: resume balance on rw (re)mounts properly This introduces btrfs_resume_balance_async(), which, given that restriper state was recovered earlier by btrfs_recover_balance(), resumes balance in btrfs-balance kthread. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 24 +++++++++++++++--------- fs/btrfs/super.c | 4 ++++ fs/btrfs/volumes.c | 36 +++++++++++++++++++++++++++--------- fs/btrfs/volumes.h | 1 + 4 files changed, 47 insertions(+), 18 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a7961ba161..8cc47103a32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2490,17 +2490,23 @@ retry_root_backup: goto fail_trans_kthread; } - if (!(sb->s_flags & MS_RDONLY)) { - down_read(&fs_info->cleanup_work_sem); - err = btrfs_orphan_cleanup(fs_info->fs_root); - if (!err) - err = btrfs_orphan_cleanup(fs_info->tree_root); + if (sb->s_flags & MS_RDONLY) + return 0; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); + close_ctree(tree_root); + return ret; + } + up_read(&fs_info->cleanup_work_sem); - if (err) { - close_ctree(tree_root); - return err; - } + ret = btrfs_resume_balance_async(fs_info); + if (ret) { + printk(KERN_WARNING "btrfs: failed to resume balance\n"); + close_ctree(tree_root); + return ret; } return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0eb9a4da069..e23991574fd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1187,6 +1187,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto restore; + sb->s_flags &= ~MS_RDONLY; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 48943d0f861..ecaad40e7ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2845,28 +2845,46 @@ out: static int balance_kthread(void *data) { - struct btrfs_balance_control *bctl = - (struct btrfs_balance_control *)data; - struct btrfs_fs_info *fs_info = bctl->fs_info; + struct btrfs_fs_info *fs_info = data; int ret = 0; mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - set_balance_control(bctl); - - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); - } else { + if (fs_info->balance_ctl) { printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); + ret = btrfs_balance(fs_info->balance_ctl, NULL); } mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + return ret; } +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *tsk; + + spin_lock(&fs_info->balance_lock); + if (!fs_info->balance_ctl) { + spin_unlock(&fs_info->balance_lock); + return 0; + } + spin_unlock(&fs_info->balance_lock); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + return 0; + } + + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); + if (IS_ERR(tsk)) + return PTR_ERR(tsk); + + return 0; +} + int btrfs_recover_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e1b1a649fc5..95f6637614d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,6 +281,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -- cgit v1.2.3 From d3a94048c912e18e99a091d5ea2d0a1178152d6f Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Mon, 25 Jun 2012 15:36:12 -0600 Subject: Btrfs: use _IOR for BTRFS_IOC_SUBVOL_GETFLAGS We used the wrong ioctl macro for the getflags ioctl before. As we don't have the set/getflags ioctls in the user space ioctl.h at the moment, it's safe to fix it now. Reviewed-by: David Sterba Signed-off-by: Alexander Block --- fs/btrfs/ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 497c530724c..e440aa653c3 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -339,7 +339,7 @@ struct btrfs_ioctl_get_dev_stats { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ struct btrfs_ioctl_scrub_args) -- cgit v1.2.3 From 6bf02314d9a5c29f6ec30285b9ad5361c2d4c85a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 25 Jun 2012 21:59:09 -0600 Subject: Btrfs: fix wrong check during log recovery When we're evicting an inode during log recovery, we need to ensure that the inode is not in orphan state any more, which means inode's run_time flags has _no_ BTRFS_INODE_HAS_ORPHAN_ITEM. Thus, the BUG_ON was triggered because of a wrong check for the flags. Reviewed-by: David Sterba Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6971fb5fc85..9f07bd121f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3754,7 +3754,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { - BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)); goto no_delete; } -- cgit v1.2.3 From bdb7d303b33c1648514c9f9461d7513a4c05ce48 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 27 Jun 2012 15:10:56 -0400 Subject: Btrfs: fix tree log remove space corner case The tree log stuff can have allocated space that we end up having split across a bitmap and a real extent. The free space code does not deal with this, it assumes that if it finds an extent or bitmap entry that the entire range must fall within the entry it finds. This isn't necessarily the case, so rework the remove function so it can handle this case properly. This fixed two panics the user hit, first in the case where the space was initially in a bitmap and then in an extent entry, and then the reverse case. Thanks, Reported-and-tested-by: Shaun Reich Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 145 ++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 93 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 19a0d85b451..a70c54e2e1b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1542,29 +1542,26 @@ again: end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; /* - * XXX - this can go away after a few releases. - * - * since the only user of btrfs_remove_free_space is the tree logging - * stuff, and the only way to test that is under crash conditions, we - * want to have this debug stuff here just in case somethings not - * working. Search the bitmap for the space we are trying to use to - * make sure its actually there. If its not there then we need to stop - * because something has gone wrong. + * We need to search for bits in this bitmap. We could only cover some + * of the extent in this bitmap thanks to how we add space, so we need + * to search for as much as it as we can and clear that amount, and then + * go searching for the next bit. */ search_start = *offset; - search_bytes = *bytes; + search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); BUG_ON(ret < 0 || search_start != *offset); - if (*offset > bitmap_info->offset && *offset + *bytes > end) { - bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); - *bytes -= end - *offset + 1; - *offset = end + 1; - } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { - bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); - *bytes = 0; - } + /* We may have found more bits than what we need */ + search_bytes = min(search_bytes, *bytes); + + /* Cannot clear past the end of the bitmap */ + search_bytes = min(search_bytes, end - search_start + 1); + + bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); + *offset += search_bytes; + *bytes -= search_bytes; if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); @@ -1595,7 +1592,7 @@ again: * everything over again. */ search_start = *offset; - search_bytes = *bytes; + search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); if (ret < 0 || search_start != *offset) @@ -1878,12 +1875,14 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - struct btrfs_free_space *next_info = NULL; int ret = 0; spin_lock(&ctl->tree_lock); again: + if (!bytes) + goto out_lock; + info = tree_search_offset(ctl, offset, 0, 0); if (!info) { /* @@ -1904,88 +1903,48 @@ again: } } - if (info->bytes < bytes && rb_next(&info->offset_index)) { - u64 end; - next_info = rb_entry(rb_next(&info->offset_index), - struct btrfs_free_space, - offset_index); - - if (next_info->bitmap) - end = next_info->offset + - BITS_PER_BITMAP * ctl->unit - 1; - else - end = next_info->offset + next_info->bytes; - - if (next_info->bytes < bytes || - next_info->offset > offset || offset > end) { - printk(KERN_CRIT "Found free space at %llu, size %llu," - " trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); - WARN_ON(1); - ret = -EINVAL; - goto out_lock; - } - - info = next_info; - } - - if (info->bytes == bytes) { + if (!info->bitmap) { unlink_free_space(ctl, info); - if (info->bitmap) { - kfree(info->bitmap); - ctl->total_bitmaps--; - } - kmem_cache_free(btrfs_free_space_cachep, info); - ret = 0; - goto out_lock; - } - - if (!info->bitmap && info->offset == offset) { - unlink_free_space(ctl, info); - info->offset += bytes; - info->bytes -= bytes; - ret = link_free_space(ctl, info); - WARN_ON(ret); - goto out_lock; - } + if (offset == info->offset) { + u64 to_free = min(bytes, info->bytes); + + info->bytes -= to_free; + info->offset += to_free; + if (info->bytes) { + ret = link_free_space(ctl, info); + WARN_ON(ret); + } else { + kmem_cache_free(btrfs_free_space_cachep, info); + } - if (!info->bitmap && info->offset <= offset && - info->offset + info->bytes >= offset + bytes) { - u64 old_start = info->offset; - /* - * we're freeing space in the middle of the info, - * this can happen during tree log replay - * - * first unlink the old info and then - * insert it again after the hole we're creating - */ - unlink_free_space(ctl, info); - if (offset + bytes < info->offset + info->bytes) { - u64 old_end = info->offset + info->bytes; + offset += to_free; + bytes -= to_free; + goto again; + } else { + u64 old_end = info->bytes + info->offset; - info->offset = offset + bytes; - info->bytes = old_end - info->offset; + info->bytes = offset - info->offset; ret = link_free_space(ctl, info); WARN_ON(ret); if (ret) goto out_lock; - } else { - /* the hole we're creating ends at the end - * of the info struct, just free the info - */ - kmem_cache_free(btrfs_free_space_cachep, info); - } - spin_unlock(&ctl->tree_lock); - /* step two, insert a new info struct to cover - * anything before the hole - */ - ret = btrfs_add_free_space(block_group, old_start, - offset - old_start); - WARN_ON(ret); /* -ENOMEM */ - goto out; + /* Not enough bytes in this entry to satisfy us */ + if (old_end < offset + bytes) { + bytes -= old_end - offset; + offset = old_end; + goto again; + } else if (old_end == offset + bytes) { + /* all done */ + goto out_lock; + } + spin_unlock(&ctl->tree_lock); + + ret = btrfs_add_free_space(block_group, offset + bytes, + old_end - (offset + bytes)); + WARN_ON(ret); + goto out; + } } ret = remove_from_bitmap(ctl, info, &offset, &bytes); -- cgit v1.2.3 From 7fd1a3f73f3743b4ffd520effe288a70b0ec47c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 27 Jun 2012 17:18:41 -0400 Subject: Btrfs: hold a ref on the inode during writepages We can race with unlink and not actually be able to do our igrab in btrfs_add_ordered_extent. This will result in all sorts of problems. Instead of doing the complicated work to try and handle returning an error properly from btrfs_add_ordered_extent, just hold a ref to the inode during writepages. If we cannot grab a ref we know we're freeing this inode anyway and can just drop the dirty pages on the floor, because screw them we're going to invalidate them anyway. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aaa12c1eb34..01c21b6c6d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3324,6 +3324,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { + struct inode *inode = mapping->host; int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -3334,6 +3335,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int scanned = 0; int tag; + /* + * We have to hold onto the inode so that ordered extents can do their + * work when the IO finishes. The alternative to this is failing to add + * an ordered extent if the igrab() fails there and that is a huge pain + * to deal with, so instead just hold onto the inode throughout the + * writepages operation. If it fails here we are freeing up the inode + * anyway and we'd rather not waste our time writing out stuff that is + * going to be truncated anyway. + */ + if (!igrab(inode)) + return 0; + pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -3428,6 +3441,7 @@ retry: index = 0; goto retry; } + btrfs_add_delayed_iput(inode); return ret; } -- cgit v1.2.3 From b6305567e7d31b0bec1b8cb9ec0cadd7f7086f5f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 2 Jul 2012 15:29:53 -0400 Subject: Btrfs: run delayed directory updates during log replay While we are resolving directory modifications in the tree log, we are triggering delayed metadata updates to the filesystem btrees. This commit forces the delayed updates to run so the replay code can find any modifications done. It stops us from crashing because the directory deleltion replay expects items to be removed immediately from the tree. Signed-off-by: Chris Mason cc: stable@kernel.org --- fs/btrfs/tree-log.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2017d0ff511..8abeae4224f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -690,6 +690,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, kfree(name); iput(inode); + + btrfs_run_delayed_items(trans, root); return ret; } @@ -895,6 +897,7 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); + btrfs_run_delayed_items(trans, root); } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; @@ -1475,6 +1478,9 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); + + btrfs_run_delayed_items(trans, root); + kfree(name); iput(inode); -- cgit v1.2.3