Diffstat (limited to 'fs')
839 files changed, 83282 insertions, 30181 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 9a1d4263075..15b67916620 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -37,7 +37,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) return ERR_PTR(-ENOMEM); size = v9fs_fid_xattr_get(fid, name, value, size); if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) goto err_out; } @@ -131,7 +131,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) buffer = kmalloc(size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(acl, buffer, size); + retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); if (retval < 0) goto err_free_out; switch (type) { @@ -251,7 +251,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); + error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return error; @@ -304,7 +304,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, return -EPERM; if (value) { /* update the cached acl value */ - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index b85efa77394..d934f04e773 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -184,10 +184,20 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->afid = option; break; case Opt_uname: - match_strlcpy(v9ses->uname, &args[0], PATH_MAX); + kfree(v9ses->uname); + v9ses->uname = match_strdup(&args[0]); + if (!v9ses->uname) { + ret = -ENOMEM; + goto free_and_return; + } break; case Opt_remotename: - match_strlcpy(v9ses->aname, &args[0], PATH_MAX); + kfree(v9ses->aname); + v9ses->aname = match_strdup(&args[0]); + if (!v9ses->aname) { + ret = -ENOMEM; + goto free_and_return; + } break; case Opt_nodevmap: v9ses->nodev = 1; @@ -287,21 +297,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, struct p9_fid *fid; int rc; - v9ses->uname = __getname(); + v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) return ERR_PTR(-ENOMEM); - v9ses->aname = __getname(); + v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); if (!v9ses->aname) { - __putname(v9ses->uname); + kfree(v9ses->uname); return ERR_PTR(-ENOMEM); } init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); if (rc) { - __putname(v9ses->aname); - __putname(v9ses->uname); + kfree(v9ses->aname); + kfree(v9ses->uname); return ERR_PTR(rc); } @@ -309,8 +319,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - strcpy(v9ses->uname, V9FS_DEFUSER); - strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -412,8 +420,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) kfree(v9ses->cachetag); } #endif - __putname(v9ses->uname); - __putname(v9ses->aname); + kfree(v9ses->uname); + kfree(v9ses->aname); bdi_destroy(&v9ses->bdi); @@ -560,6 +568,11 @@ static int v9fs_init_inode_cache(void) */ static void v9fs_destroy_inode_cache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
+ */ + rcu_barrier(); kmem_cache_destroy(v9fs_inode_cache); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index e78956cbd70..34c59f14a1c 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -144,7 +144,7 @@ extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata); + unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index d529437ff44..64600b5d052 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -100,13 +100,13 @@ static void v9fs_dentry_release(struct dentry *dentry) } } -static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct p9_fid *fid; struct inode *inode; struct v9fs_inode *v9inode; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fc06fd27065..c2483e97bee 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); + /* Update file times before taking page lock */ + file_update_time(filp); + v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ v9fs_fscache_wait_on_page_write(inode, page); @@ -735,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 57ccb7537da..890bed538f9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -712,88 +712,34 @@ error: } /** - * v9fs_vfs_create - VFS hook to create files + * v9fs_vfs_create - VFS hook to create a regular file + * + * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called + * for mknod(2). + * * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @mode: create permissions - * @nd: path information * */ static int v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { - int err; - u32 perm; - int flags; - struct file *filp; - struct v9fs_inode *v9inode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid, *inode_fid; - - err = 0; - fid = NULL; - v9ses = v9fs_inode2v9ses(dir); - perm = unixmode2p9mode(v9ses, mode); - if (nd) - flags = nd->intent.open.flags; - else - flags = O_RDWR; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); + u32 perm = unixmode2p9mode(v9ses, mode); + struct p9_fid *fid; - fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, - v9fs_proto_dotu(v9ses))); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - fid = NULL; - goto error; - } + /* P9_OEXCL? 
*/ + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR); + if (IS_ERR(fid)) + return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); - /* if we are opening a file, assign the open fid to the file */ - if (nd) { - v9inode = V9FS_I(dentry->d_inode); - mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto error; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - goto error; - } - - filp->private_data = fid; -#ifdef CONFIG_9P_FSCACHE - if (v9ses->cache) - v9fs_cache_inode_set_cookie(dentry->d_inode, filp); -#endif - } else - p9_client_clunk(fid); + p9_client_clunk(fid); return 0; - -error: - if (fid) - p9_client_clunk(fid); - - return err; } /** @@ -839,7 +785,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata) + unsigned int flags) { struct dentry *res; struct super_block *sb; @@ -849,8 +795,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n", + dir, dentry->d_name.name, dentry, flags); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -910,6 +856,86 @@ error: return ERR_PTR(result); } +static int +v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) +{ + int err; + u32 perm; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; + struct dentry *res = NULL; + + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; + } + + /* Only creates */ + if (!(flags & O_CREAT) || dentry->d_inode) + return finish_no_open(file, res); + + err = 0; + fid = NULL; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + v9fs_invalidate_inode_attr(dir); + v9inode = V9FS_I(dentry->d_inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. 
+ */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + err = finish_open(file, dentry, generic_file_open, opened); + if (err) + goto error; + + file->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, file); +#endif + + *opened |= FILE_CREATED; +out: + dput(res); + return err; + +error: + if (fid) + p9_client_clunk(fid); + goto out; +} + /** * v9fs_vfs_unlink - VFS unlink hook to delete an inode * @i: inode that is being unlinked @@ -1250,12 +1276,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - strncpy(buffer, st->extension, buflen); + retval = min(strlen(st->extension)+1, (size_t)buflen); + memcpy(buffer, st->extension, retval); - p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", - dentry->d_name.name, st->extension, buffer); + p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n", + dentry->d_name.name, st->extension, buflen, buffer); - retval = strnlen(buffer, buflen); done: p9stat_free(st); kfree(st); @@ -1488,6 +1514,7 @@ out: static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, .symlink = v9fs_vfs_symlink, .link = v9fs_vfs_link, .unlink = v9fs_vfs_unlink, @@ -1502,6 +1529,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { static const struct inode_operations v9fs_dir_inode_operations = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e3dd2a1e2bf..40895546e10 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -230,20 +230,25 @@ int v9fs_open_to_dotl_flags(int flags) * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @mode: create permissions - * @nd: path information * */ static int v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - struct nameidata *nd) + bool excl) +{ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); +} + +static int +v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t omode, + int *opened) { int err = 0; gid_t gid; - int flags; umode_t mode; char *name = NULL; - struct file *filp; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; @@ -251,19 +256,23 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct p9_fid *dfid, *ofid, *inode_fid; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; + struct dentry *res = NULL; - v9ses = v9fs_inode2v9ses(dir); - if (nd) - flags = nd->intent.open.flags; - else { - /* - * create call without LOOKUP_OPEN is due - * to mknod of regular files. So use mknod - * operation. 
- */ - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; } + /* Only creates */ + if (!(flags & O_CREAT) || dentry->d_inode) + return finish_no_open(file, res); + + v9ses = v9fs_inode2v9ses(dir); + name = (char *) dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", name, flags, omode); @@ -272,7 +281,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; + goto out; } /* clone a fid to use for creation */ @@ -280,7 +289,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - return err; + goto out; } gid = v9fs_get_fsgid_for_create(dir); @@ -345,17 +354,18 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, } mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); + err = finish_open(file, dentry, generic_file_open, opened); + if (err) goto err_clunk_old_fid; - } - filp->private_data = ofid; + file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE if (v9ses->cache) - v9fs_cache_inode_set_cookie(inode, filp); + v9fs_cache_inode_set_cookie(inode, file); #endif - return 0; + *opened |= FILE_CREATED; +out: + dput(res); + return err; error: if (fid) @@ -364,7 +374,7 @@ err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); v9fs_set_create_acl(NULL, &dacl, &pacl); - return err; + goto out; } /** @@ -982,6 +992,7 @@ out: const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, + .atomic_open = v9fs_vfs_atomic_open_dotl, .lookup = v9fs_vfs_lookup, .link = v9fs_vfs_link_dotl, .symlink = v9fs_vfs_symlink_dotl, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8c92a9ba833..137d5039689 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -89,7 +89,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; - sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; if (!v9ses->cache) sb->s_flags |= MS_SYNCHRONOUS; @@ -137,7 +137,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto close_session; } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); + sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; diff --git a/fs/Kconfig b/fs/Kconfig index f95ae3a027f..780725a463b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y && EXT4_FS_XATTR - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT4_FS=y + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be 
- implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL @@ -220,6 +210,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" +source "fs/f2fs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 02257420274..0efd1524b97 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -164,3 +164,11 @@ config BINFMT_MISC You may say M here for module support and later load the module when you have use for it; the module is called binfmt_misc. If you don't know what to answer at this point, say Y. + +config COREDUMP + bool "Enable core dump support" if EXPERT + default y + help + This option enables support for performing core dumps. You almost + certainly want to say Y here. Not necessary on systems that never + need debugging or only ever run flawless code. diff --git a/fs/Makefile b/fs/Makefile index 2fb97793467..9d53192236f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_FHANDLE) += fhandle.o @@ -122,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ +obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 718ac1f440c..585adafb0cc 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -46,8 +46,8 @@ struct adfs_sb_info { struct adfs_discmap *s_map; /* bh list containing map */ struct adfs_dir_ops *s_dir; /* directory operations */ - uid_t s_uid; /* owner uid */ - gid_t s_gid; /* owner gid */ + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ umode_t s_other_mask; /* ADFS other perm -> unix perm */ int s_ftsuffix; /* ,xyz hex filetype suffix option */ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 3d83075aaa2..b3be2e7c564 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -266,7 +266,7 @@ const struct dentry_operations adfs_dentry_operations = { }; static struct dentry * -adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct object_info obj; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 1dab6a174d6..5f95d1ed9c6 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, adfs_get_block); } +static void adfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + static int adfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - 
vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + adfs_write_failed(mapping, pos + len); return ret; } @@ -304,8 +309,8 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) * we can't change the UID or GID of any file - * we have a global UID/GID in the superblock */ - if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid)) + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid))) error = -EPERM; if (error) diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 06fdcc9382c..d5712293579 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> +#include <linux/user_namespace.h> #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" @@ -130,10 +131,10 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root) { struct adfs_sb_info *asb = ADFS_SB(root->d_sb); - if (asb->s_uid != 0) - seq_printf(seq, ",uid=%u", asb->s_uid); - if (asb->s_gid != 0) - seq_printf(seq, ",gid=%u", asb->s_gid); + if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid)); + if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid)); if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) @@ -175,12 +176,16 @@ static int parse_options(struct super_block *sb, char *options) case Opt_uid: if (match_int(args, &option)) return -EINVAL; - asb->s_uid = option; + asb->s_uid = make_kuid(current_user_ns(), option); + if (!uid_valid(asb->s_uid)) + return -EINVAL; break; case Opt_gid: if (match_int(args, &option)) return -EINVAL; - asb->s_gid = option; + asb->s_gid = make_kgid(current_user_ns(), option); + if (!gid_valid(asb->s_gid)) + return -EINVAL; break; case Opt_ownmask: if (match_octal(args, &option)) @@ -246,7 +251,6 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) static void adfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } @@ -276,6 +280,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(adfs_inode_cachep); } @@ -370,8 +379,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = asb; /* set default options */ - asb->s_uid = 0; - asb->s_gid = 0; + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; asb->s_ftsuffix = 0; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 1fceb320d2f..3952121f2f2 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -3,6 +3,7 @@ #include <linux/buffer_head.h> #include <linux/amigaffs.h> #include <linux/mutex.h> +#include <linux/workqueue.h> /* AmigaOS allows file names with up to 30 characters length. * Names longer than that will be silently truncated. If you @@ -87,8 +88,8 @@ struct affs_sb_info { u32 s_root_block; /* FFS root block number. */ int s_hashsize; /* Size of hash table. */ unsigned long s_flags; /* See below. 
*/ - uid_t s_uid; /* uid to override */ - gid_t s_gid; /* gid to override */ + kuid_t s_uid; /* uid to override */ + kgid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ struct mutex s_bmlock; /* Protects bitmap access. */ @@ -100,6 +101,10 @@ struct affs_sb_info { char *s_prefix; /* Prefix for volumes and assigns. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ spinlock_t symlink_lock; /* protects the previous two */ + struct super_block *sb; /* the VFS superblock object */ + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work sb_work; /* superblock flush delayed work */ + spinlock_t work_lock; /* protects sb_work and work_queued */ }; #define SF_INTL 0x0001 /* International filesystem. */ @@ -120,6 +125,8 @@ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) return sb->s_fs_info; } +void affs_mark_sb_dirty(struct super_block *sb); + /* amigaffs.c */ extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); @@ -146,9 +153,9 @@ extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); -extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool); extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 52a6407682e..eb82ee53ee0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -122,22 +122,16 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) } static void -affs_fix_dcache(struct dentry *dentry, u32 entry_ino) +affs_fix_dcache(struct inode *inode, u32 entry_ino) { - struct inode *inode = dentry->d_inode; - void *data = dentry->d_fsdata; - struct list_head *head, *next; - + struct dentry *dentry; + struct hlist_node *p; spin_lock(&inode->i_lock); - head = &inode->i_dentry; - next = head->next; - while (next != head) { - dentry = list_entry(next, struct dentry, d_alias); + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { - dentry->d_fsdata = data; + dentry->d_fsdata = (void *)inode->i_ino; break; } - next = next->next; } spin_unlock(&inode->i_lock); } @@ -177,7 +171,11 @@ affs_remove_link(struct dentry *dentry) } affs_lock_dir(dir); - affs_fix_dcache(dentry, link_ino); + /* + * if there's a dentry for that block, make it + * refer to inode itself. 
+ */ + affs_fix_dcache(inode, link_ino); retval = affs_remove_hash(dir, link_bh); if (retval) { affs_unlock_dir(dir); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 3e262711ae0..a32246b8359 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -10,30 +10,6 @@ #include <linux/slab.h> #include "affs.h" -/* This is, of course, shamelessly stolen from fs/minix */ - -static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 }; - -static u32 -affs_count_free_bits(u32 blocksize, const void *data) -{ - const u32 *map; - u32 free; - u32 tmp; - - map = data; - free = 0; - for (blocksize /= 4; blocksize > 0; blocksize--) { - tmp = *map++; - while (tmp) { - free += nibblemap[tmp & 0xf]; - tmp >>= 4; - } - } - - return free; -} - u32 affs_count_free_blocks(struct super_block *sb) { @@ -103,7 +79,7 @@ affs_free_block(struct super_block *sb, u32 block) *(__be32 *)bh->b_data = cpu_to_be32(tmp - mask); mark_buffer_dirty(bh); - sb->s_dirt = 1; + affs_mark_sb_dirty(sb); bm->bm_free++; mutex_unlock(&sbi->s_bmlock); @@ -248,7 +224,7 @@ find_bit: *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); mark_buffer_dirty(bh); - sb->s_dirt = 1; + affs_mark_sb_dirty(sb); mutex_unlock(&sbi->s_bmlock); @@ -317,7 +293,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) goto out; } pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); - bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); /* Don't try read the extension if this is the last block, * but we also need the right bm pointer below @@ -367,7 +343,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) /* recalculate bitmap count for last block */ bm--; - bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4); + bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); out: affs_brelse(bh); diff --git a/fs/affs/file.c b/fs/affs/file.c index 2f4c935cb32..af3261b7810 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = { }; const struct inode_operations affs_file_inode_operations = { - .truncate = affs_truncate, .setattr = affs_notify_change, }; @@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page) return block_read_full_page(page, affs_get_block); } +static void affs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + affs_truncate(inode); + } +} + static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + affs_write_failed(mapping, pos + len); return ret; } diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 8bc4a59f4e7..0e092d08680 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -80,17 +80,17 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (id == 0 || sbi->s_flags & SF_SETUID) inode->i_uid = sbi->s_uid; else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) - inode->i_uid = 0; + i_uid_write(inode, 0); else - 
inode->i_uid = id; + i_uid_write(inode, id); id = be16_to_cpu(tail->gid); if (id == 0 || sbi->s_flags & SF_SETGID) inode->i_gid = sbi->s_gid; else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) - inode->i_gid = 0; + i_gid_write(inode, 0); else - inode->i_gid = id; + i_gid_write(inode, id); switch (be32_to_cpu(tail->stype)) { case ST_ROOT: @@ -193,13 +193,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) tail->size = cpu_to_be32(inode->i_size); secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change); if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { - uid = inode->i_uid; - gid = inode->i_gid; + uid = i_uid_read(inode); + gid = i_gid_read(inode); if (AFFS_SB(sb)->s_flags & SF_MUFS) { - if (inode->i_uid == 0 || inode->i_uid == 0xFFFF) - uid = inode->i_uid ^ ~0; - if (inode->i_gid == 0 || inode->i_gid == 0xFFFF) - gid = inode->i_gid ^ ~0; + if (uid == 0 || uid == 0xFFFF) + uid = uid ^ ~0; + if (gid == 0 || gid == 0xFFFF) + gid = gid ^ ~0; } if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) tail->uid = cpu_to_be16(uid); @@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); + error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; + + truncate_setsize(inode, attr->ia_size); + affs_truncate(inode); } setattr_copy(inode, attr); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 47806940aac..ff65884a783 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -211,7 +211,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) } struct dentry * -affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; @@ -255,7 +255,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/affs/super.c b/fs/affs/super.c index 0782653a05a..b84dc735250 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -17,6 +17,7 @@ #include <linux/magic.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/writeback.h> #include "affs.h" extern struct timezone sys_tz; @@ -25,15 +26,17 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); static void -affs_commit_super(struct super_block *sb, int wait, int clean) +affs_commit_super(struct super_block *sb, int wait) { struct affs_sb_info *sbi = AFFS_SB(sb); struct buffer_head *bh = sbi->s_root_bh; struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); - tail->bm_flag = cpu_to_be32(clean); + lock_buffer(bh); secs_to_datestamp(get_seconds(), &tail->disk_change); affs_fix_checksum(sb, bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); if (wait) sync_dirty_buffer(bh); @@ -45,9 +48,7 @@ affs_put_super(struct super_block *sb) struct affs_sb_info *sbi = AFFS_SB(sb); pr_debug("AFFS: put_super()\n"); - if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) - affs_commit_super(sb, 1, 1); - + cancel_delayed_work_sync(&sbi->sb_work); kfree(sbi->s_prefix); affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); @@ -55,26 +56,43 @@ affs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static void 
-affs_write_super(struct super_block *sb) +static int +affs_sync_fs(struct super_block *sb, int wait) +{ + affs_commit_super(sb, wait); + return 0; +} + +static void flush_superblock(struct work_struct *work) { - lock_super(sb); - if (!(sb->s_flags & MS_RDONLY)) - affs_commit_super(sb, 1, 2); - sb->s_dirt = 0; - unlock_super(sb); + struct affs_sb_info *sbi; + struct super_block *sb; - pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds()); + sbi = container_of(work, struct affs_sb_info, sb_work.work); + sb = sbi->sb; + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + affs_commit_super(sb, 1); } -static int -affs_sync_fs(struct super_block *sb, int wait) +void affs_mark_sb_dirty(struct super_block *sb) { - lock_super(sb); - affs_commit_super(sb, wait, 2); - sb->s_dirt = 0; - unlock_super(sb); - return 0; + struct affs_sb_info *sbi = AFFS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sb_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); } static struct kmem_cache * affs_inode_cachep; @@ -129,6 +147,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(affs_inode_cachep); } @@ -138,7 +161,6 @@ static const struct super_operations affs_sops = { .write_inode = affs_write_inode, .evict_inode = affs_evict_inode, .put_super = affs_put_super, - .write_super = affs_write_super, .sync_fs = affs_sync_fs, .statfs = affs_statfs, .remount_fs = affs_remount, @@ -171,7 +193,7 @@ static const match_table_t tokens = { }; static int -parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root, +parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) { char *p; @@ -236,13 +258,17 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s case Opt_setgid: if (match_int(&args[0], &option)) return 0; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 0; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) return 0; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 0; *mount_opts |= SF_SETUID; break; case Opt_verbose: @@ -284,8 +310,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) int num_bm; int i, j; s32 key; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; int reserved; unsigned long mount_flags; int tmp_flags; /* fix remount prototype... 
*/ @@ -305,8 +331,11 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; + sbi->sb = sb; mutex_init(&sbi->s_bmlock); spin_lock_init(&sbi->symlink_lock); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, @@ -507,8 +536,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) { struct affs_sb_info *sbi = AFFS_SB(sb); int blocksize; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; int mode; int reserved; int root_block; @@ -531,6 +560,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } + flush_delayed_work(&sbi->sb_work); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; @@ -549,10 +579,9 @@ affs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - if (*flags & MS_RDONLY) { - affs_write_super(sb); + if (*flags & MS_RDONLY) affs_free_bitmap(sb); - } else + else res = affs_init_bitmap(sb, flags); return res; diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 587ef5123cd..7ef637d7f3a 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -351,9 +351,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work) */ void afs_flush_callback_breaks(struct afs_server *server) { - cancel_delayed_work(&server->cb_break_work); - queue_delayed_work(afs_callback_update_worker, - &server->cb_break_work, 0); + mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); } #if 0 diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e22dc4b4a50..db477906ba4 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -20,16 +20,16 @@ #include "internal.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); -static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd); + bool excl); static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); @@ -516,7 +516,7 @@ out: * look up an entry in a directory */ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct afs_vnode *vnode; struct afs_fid fid; @@ -598,7 +598,7 @@ success: * - NOTE! 
the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); @@ -607,7 +607,7 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) void *dir_version; int ret; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; vnode = AFS_FS_I(dentry->d_inode); @@ -949,7 +949,7 @@ error: * create a regular file on an AFS filesystem */ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct afs_file_status status; struct afs_callback cb; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 298cf8919ec..9682c33d5da 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -22,7 +22,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_mntpt_open(struct inode *inode, struct file *file); static void afs_mntpt_expiry_timed_out(struct work_struct *work); @@ -104,7 +104,7 @@ out: */ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { _enter("%p,%p{%p{%s},%s}", dir, diff --git a/fs/afs/server.c b/fs/afs/server.c index d59b7516e94..f342acf3547 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -285,12 +285,7 @@ static void afs_reap_server(struct work_struct *work) expiry = server->time_of_death + afs_server_timeout; if (expiry > now) { delay = (expiry - now) * HZ; - if (!queue_delayed_work(afs_wq, &afs_server_reaper, - delay)) { - cancel_delayed_work(&afs_server_reaper); - queue_delayed_work(afs_wq, &afs_server_reaper, - delay); - } + mod_delayed_work(afs_wq, &afs_server_reaper, delay); break; } @@ -323,6 +318,5 @@ static void afs_reap_server(struct work_struct *work) void __exit afs_purge_servers(void) { afs_server_timeout = 0; - cancel_delayed_work(&afs_server_reaper); - queue_delayed_work(afs_wq, &afs_server_reaper, 0); + mod_delayed_work(afs_wq, &afs_server_reaper, 0); } diff --git a/fs/afs/super.c b/fs/afs/super.c index f02b31e7e64..43165009428 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -123,6 +123,11 @@ void __exit afs_fs_exit(void) BUG(); } + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
+ */ + rcu_barrier(); kmem_cache_destroy(afs_inode_cachep); _leave(""); } @@ -395,7 +400,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, afs_set_super, as); + sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); afs_put_volume(vol); @@ -406,7 +411,6 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - sb->s_flags = flags; ret = afs_fill_super(sb, ¶ms); if (ret < 0) { deactivate_locked_super(sb); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 431984d2e37..57bcb159653 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -561,12 +561,7 @@ static void afs_vlocation_reaper(struct work_struct *work) if (expiry > now) { delay = (expiry - now) * HZ; _debug("delay %lu", delay); - if (!queue_delayed_work(afs_wq, &afs_vlocation_reap, - delay)) { - cancel_delayed_work(&afs_vlocation_reap); - queue_delayed_work(afs_wq, &afs_vlocation_reap, - delay); - } + mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); break; } @@ -614,13 +609,10 @@ void afs_vlocation_purge(void) spin_lock(&afs_vlocation_updates_lock); list_del_init(&afs_vlocation_updates); spin_unlock(&afs_vlocation_updates_lock); - cancel_delayed_work(&afs_vlocation_update); - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, 0); + mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); destroy_workqueue(afs_vlocation_update_worker); - cancel_delayed_work(&afs_vlocation_reap); - queue_delayed_work(afs_wq, &afs_vlocation_reap, 0); + mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); } /* @@ -56,13 +56,6 @@ static struct kmem_cache *kioctx_cachep; static struct workqueue_struct *aio_wq; -/* Used for rare fput completion. */ -static void aio_fput_routine(struct work_struct *); -static DECLARE_WORK(fput_work, aio_fput_routine); - -static DEFINE_SPINLOCK(fput_lock); -static LIST_HEAD(fput_head); - static void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); @@ -479,7 +472,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) { unsigned short allocated, to_alloc; long avail; - bool called_fput = false; struct kiocb *req, *n; struct aio_ring *ring; @@ -495,28 +487,11 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) if (allocated == 0) goto out; -retry: spin_lock_irq(&ctx->ctx_lock); ring = kmap_atomic(ctx->ring_info.ring_pages[0]); avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; BUG_ON(avail < 0); - if (avail == 0 && !called_fput) { - /* - * Handle a potential starvation case. It is possible that - * we hold the last reference on a struct file, causing us - * to delay the final fput to non-irq context. In this case, - * ctx->reqs_active is artificially high. Calling the fput - * routine here may free up a slot in the event completion - * ring, allowing this allocation to succeed. - */ - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); - aio_fput_routine(NULL); - called_fput = true; - goto retry; - } - if (avail < allocated) { /* Trim back the number of requests. 
*/ list_for_each_entry_safe(req, n, &batch->head, ki_batch) { @@ -570,36 +545,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) wake_up_all(&ctx->wait); } -static void aio_fput_routine(struct work_struct *data) -{ - spin_lock_irq(&fput_lock); - while (likely(!list_empty(&fput_head))) { - struct kiocb *req = list_kiocb(fput_head.next); - struct kioctx *ctx = req->ki_ctx; - - list_del(&req->ki_list); - spin_unlock_irq(&fput_lock); - - /* Complete the fput(s) */ - if (req->ki_filp != NULL) - fput(req->ki_filp); - - /* Link the iocb into the context's free list */ - rcu_read_lock(); - spin_lock_irq(&ctx->ctx_lock); - really_put_req(ctx, req); - /* - * at that point ctx might've been killed, but actual - * freeing is RCU'd - */ - spin_unlock_irq(&ctx->ctx_lock); - rcu_read_unlock(); - - spin_lock_irq(&fput_lock); - } - spin_unlock_irq(&fput_lock); -} - /* __aio_put_req * Returns true if this put was the last user of the request. */ @@ -618,21 +563,9 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* - * Try to optimize the aio and eventfd file* puts, by avoiding to - * schedule work in case it is not final fput() time. In normal cases, - * we would not be holding the last reference to the file*, so - * this function will be executed w/out any aio kthread wakeup. - */ - if (unlikely(!fput_atomic(req->ki_filp))) { - spin_lock(&fput_lock); - list_add(&req->ki_list, &fput_head); - spin_unlock(&fput_lock); - schedule_work(&fput_work); - } else { - req->ki_filp = NULL; - really_put_req(ctx, req); - } + fput(req->ki_filp); + req->ki_filp = NULL; + really_put_req(ctx, req); return 1; } diff --git a/fs/attr.c b/fs/attr.c index 0da90951d27..1449adb14ef 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,6 +14,7 @@ #include <linux/fcntl.h> #include <linux/security.h> #include <linux/evm.h> +#include <linux/ima.h> /** * inode_change_ok - check if attribute changes to an inode are allowed @@ -48,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) + !uid_eq(attr->ia_uid, inode->i_uid)) && + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !capable(CAP_CHOWN)) + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -64,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? 
attr->ia_gid : - inode->i_gid) && !capable(CAP_FSETID)) + inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -156,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -171,6 +175,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) struct timespec now; unsigned int ia_valid = attr->ia_valid; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -245,10 +251,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr) if (!error) { fsnotify_change(dentry, ia_valid); + ima_inode_post_setattr(dentry); evm_inode_post_setattr(dentry, ia_valid); } return error; } - EXPORT_SYMBOL(notify_change); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 908e1845541..b785e770795 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_info { unsigned long last_used; atomic_t count; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ @@ -89,8 +89,8 @@ struct autofs_wait_queue { struct qstr name; u32 dev; u64 ino; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; pid_t pid; pid_t tgid; /* This is for status reporting upon return */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index aa9103f8f01..9f68a37bb2b 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p) return ino && ino->sbi->type & *(unsigned *)p; } -static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - __set_close_on_exec(fd, fdt); - spin_unlock(&files->file_lock); -} - - /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). @@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { int err, fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(O_CLOEXEC); if (likely(fd >= 0)) { struct file *filp; struct path path; @@ -257,14 +243,14 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) * corresponding to the autofs fs we want to open. 
*/ - filp = dentry_open(path.dentry, path.mnt, O_RDONLY, - current_cred()); + filp = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } - autofs_dev_ioctl_fd_install(fd, filp); + fd_install(fd, filp); } return fd; @@ -451,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry); spin_lock(&sbi->fs_lock); - param->requester.uid = ino->uid; - param->requester.gid = ino->gid; + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 1feb68ecef9..01443ce43ee 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev, { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; - struct dentry *p, *q; + struct dentry *q; spin_lock(&sbi->lookup_lock); + spin_lock(&root->d_lock); - if (prev == NULL) { - spin_lock(&root->d_lock); + if (prev) + next = prev->d_u.d_child.next; + else { prev = dget_dlock(root); next = prev->d_subdirs.next; - p = prev; - goto start; } - p = prev; - spin_lock(&p->d_lock); -again: - next = p->d_u.d_child.next; -start: +cont: if (next == &root->d_subdirs) { - spin_unlock(&p->d_lock); + spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); return NULL; @@ -121,16 +117,15 @@ start: q = list_entry(next, struct dentry, d_u.d_child); spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); - /* Negative dentry - try next */ - if (!simple_positive(q)) { - spin_unlock(&p->d_lock); - lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); - p = q; - goto again; + /* Already gone or negative dentry (under construction) - try next */ + if (q->d_count == 0 || !simple_positive(q)) { + spin_unlock(&q->d_lock); + next = q->d_u.d_child.next; + goto cont; } dget_dlock(q); spin_unlock(&q->d_lock); - spin_unlock(&p->d_lock); + spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); @@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - /* Path walk currently on this dentry? 
*/ - ino_count = atomic_read(&ino->count) + 2; - if (dentry->d_count > ino_count) - goto next; - /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) goto next; @@ -558,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_EXPIRING; - spin_lock(&dentry->d_lock); - if (!ret) { - if ((IS_ROOT(dentry) || - (autofs_type_indirect(sbi->type) && - IS_ROOT(dentry->d_parent))) && - !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - __managed_dentry_set_automount(dentry); - } - spin_unlock(&dentry->d_lock); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 8a4fed8ead3..b104726e2d0 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) void autofs4_clean_ino(struct autofs_info *ino) { - ino->uid = 0; - ino->gid = 0; + ino->uid = GLOBAL_ROOT_UID; + ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } @@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); - if (root_inode->i_uid != 0) - seq_printf(m, ",uid=%u", root_inode->i_uid); - if (root_inode->i_gid != 0) - seq_printf(m, ",gid=%u", root_inode->i_gid); + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, root_inode->i_uid)); + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); @@ -126,7 +128,7 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, +static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) { char *p; @@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, case Opt_uid: if (match_int(args, &option)) return 1; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 1; break; case Opt_gid: if (match_int(args, &option)) return 1; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 1; break; case Opt_pgrp: if (match_int(args, &option)) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 75e5f1c8e02..c93447604da 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); @@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. 
*/ spin_lock(&sbi->lookup_lock); - spin_lock(&dentry->d_lock); - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!d_mountpoint(dentry) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } - spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); out: @@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path) status = autofs4_mount_wait(dentry); if (status) return ERR_PTR(status); - spin_lock(&sbi->fs_lock); goto done; } @@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + spin_unlock(&sbi->fs_lock); goto done; + } + if (!d_mountpoint(dentry)) { /* * It's possible that user space hasn't removed directories @@ -379,46 +378,26 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * require user space behave. */ if (sbi->version > 4) { - if (have_submounts(dentry)) + if (have_submounts(dentry)) { + spin_unlock(&sbi->fs_lock); goto done; + } } else { - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!simple_empty(dentry)) goto done; - } - spin_unlock(&dentry->d_lock); } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); status = autofs4_mount_wait(dentry); - if (status) - return ERR_PTR(status); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; - } -done: - if (!(ino->flags & AUTOFS_INF_EXPIRING)) { - /* - * Any needed mounting has been completed and the path - * updated so clear DCACHE_NEED_AUTOMOUNT so we don't - * call ->d_automount() on rootless multi-mounts since - * it can lead to an incorrect ELOOP error return. - * - * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and - * symlinks as in all other cases the dentry will be covered by - * an actual mount so ->d_automount() won't be called during - * the follow. - */ - spin_lock(&dentry->d_lock); - if ((!d_mountpoint(dentry) && - !list_empty(&dentry->d_subdirs)) || - (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) - __managed_dentry_clear_automount(dentry); - spin_unlock(&dentry->d_lock); + if (status) { + spin_unlock(&sbi->fs_lock); + return ERR_PTR(status); + } } spin_unlock(&sbi->fs_lock); - +done: /* Mount succeeded, check if we ended up with a new dentry */ dentry = autofs4_mountpoint_changed(path); if (!dentry) @@ -430,6 +409,8 @@ done: int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; DPRINTK("dentry=%p %.*s", dentry, dentry->d_name.len, dentry->d_name.name); @@ -454,11 +435,36 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. */ - return autofs4_mount_wait(dentry); + status = autofs4_mount_wait(dentry); + if (status) + return status; + + spin_lock(&sbi->fs_lock); + /* + * If the dentry has been selected for expire while we slept + * on the lock then it might go away. We'll deal with that in + * ->d_automount() and wait on a new mount if the expire + * succeeds or return here if it doesn't (since there's no + * mount to follow with a rootless multi-mount). 
+ */ + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so check if this is a rootless multi-mount so + * we can avoid needless calls ->d_automount() and avoid + * an incorrect ELOOP error return. + */ + if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + status = -EISDIR; + } + spin_unlock(&sbi->fs_lock); + + return status; } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -597,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) spin_lock(&sbi->lookup_lock); __autofs4_add_expiring(dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); + d_drop(dentry); spin_unlock(&sbi->lookup_lock); return 0; @@ -670,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return -EACCES; spin_lock(&sbi->lookup_lock); - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); + if (!simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); + d_drop(dentry); spin_unlock(&sbi->lookup_lock); if (sbi->version < 5) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d38a7..03bc1d347d8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); @@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; - packet->uid = wq->uid; - packet->gid = wq->gid; + packet->uid = from_kuid_munged(user_ns, wq->uid); + packet->gid = from_kgid_munged(user_ns, wq->gid); packet->pid = wq->pid; packet->tgid = wq->tgid; break; @@ -175,8 +176,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, return; } - pipe = sbi->pipe; - get_file(pipe); + pipe = get_file(sbi->pipe); mutex_unlock(&sbi->wq_mutex); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 1b35d6bd06b..922ad460bff 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -16,7 +16,7 @@ #include <linux/poll.h> -static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence) { return -EIO; } @@ -173,13 +173,13 @@ static const struct file_operations bad_file_ops = }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } diff --git a/fs/befs/befs.h b/fs/befs/befs.h index d9a40abda6b..b2664283915 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -20,8 +20,8 @@ typedef u64 befs_blocknr_t; */ typedef struct befs_mount_options { - gid_t gid; - uid_t uid; + kgid_t gid; + kuid_t uid; int use_gid; int use_uid; int debug; diff --git 
a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e18da23d42b..2b3bda8d5e6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -15,6 +15,7 @@ #include <linux/vfs.h> #include <linux/parser.h> #include <linux/namei.h> +#include <linux/sched.h> #include "befs.h" #include "btree.h" @@ -34,7 +35,7 @@ static int befs_readdir(struct file *, void *, filldir_t); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); -static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); @@ -159,7 +160,7 @@ befs_get_block(struct inode *inode, sector_t block, } static struct dentry * -befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct super_block *sb = dir->i_sb; @@ -352,9 +353,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) */ inode->i_uid = befs_sb->mount_opts.use_uid ? - befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid); + befs_sb->mount_opts.uid : + make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid)); inode->i_gid = befs_sb->mount_opts.use_gid ? - befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid); + befs_sb->mount_opts.gid : + make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid)); set_nlink(inode, 1); @@ -454,6 +457,11 @@ befs_init_inodecache(void) static void befs_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
+ */ + rcu_barrier(); kmem_cache_destroy(befs_inode_cachep); } @@ -674,10 +682,12 @@ parse_options(char *options, befs_mount_options * opts) char *p; substring_t args[MAX_OPT_ARGS]; int option; + kuid_t uid; + kgid_t gid; /* Initialize options */ - opts->uid = 0; - opts->gid = 0; + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; opts->use_uid = 0; opts->use_gid = 0; opts->iocharset = NULL; @@ -696,23 +706,29 @@ parse_options(char *options, befs_mount_options * opts) case Opt_uid: if (match_int(&args[0], &option)) return 0; - if (option < 0) { + uid = INVALID_UID; + if (option >= 0) + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { printk(KERN_ERR "BeFS: Invalid uid %d, " "using default\n", option); break; } - opts->uid = option; + opts->uid = uid; opts->use_uid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; - if (option < 0) { + gid = INVALID_GID; + if (option >= 0) + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { printk(KERN_ERR "BeFS: Invalid gid %d, " "using default\n", option); break; } - opts->gid = option; + opts->gid = gid; opts->use_gid = 1; break; case Opt_charset: diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d12c7966db2..2785ef91191 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -85,7 +85,7 @@ const struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int err; struct inode *inode; @@ -133,7 +133,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index f20e8a71062..ad3ea1497cc 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, bfs_get_block); } +static void bfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) + truncate_pagecache(inode, to, inode->i_size); +} + static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping, ret = block_write_begin(mapping, pos, len, flags, pagep, bfs_get_block); - if (unlikely(ret)) { - loff_t isize = mapping->host->i_size; - if (pos + len > isize) - vmtruncate(mapping->host, isize); - } + if (unlikely(ret)) + bfs_write_failed(mapping, pos + len); return ret; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 9870417c26e..737aaa3f709 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -76,8 +76,8 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); - inode->i_uid = le32_to_cpu(di->i_uid); - inode->i_gid = le32_to_cpu(di->i_gid); + i_uid_write(inode, le32_to_cpu(di->i_uid)); + i_gid_write(inode, le32_to_cpu(di->i_gid)); set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); @@ -139,8 +139,8 @@ static int bfs_write_inode(struct inode *inode, struct 
writeback_control *wbc) di->i_ino = cpu_to_le16(ino); di->i_mode = cpu_to_le32(inode->i_mode); - di->i_uid = cpu_to_le32(inode->i_uid); - di->i_gid = cpu_to_le32(inode->i_gid); + di->i_uid = cpu_to_le32(i_uid_read(inode)); + di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); @@ -280,6 +280,11 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(bfs_inode_cachep); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index d146e181d10..6043567b95c 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -30,33 +30,10 @@ #include <asm/cacheflush.h> #include <asm/a.out-core.h> -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file*); -static int aout_core_dump(struct coredump_params *cprm); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, - .core_dump = aout_core_dump, - .min_coredump = PAGE_SIZE -}; - -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; - } - return 0; -} +#ifdef CONFIG_COREDUMP /* * Routine writes a core dump image in the current directory. * Currently only a stub-function. @@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end) * field, which also makes sure the core-dumps won't be recursive if the * dumping of the process results in another error.. */ - static int aout_core_dump(struct coredump_params *cprm) { struct file *file = cprm->file; @@ -89,7 +65,7 @@ static int aout_core_dump(struct coredump_params *cprm) current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = cprm->signr; + dump.signal = cprm->siginfo->si_signo; aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen @@ -135,6 +111,32 @@ end_coredump: set_fs(fs); return has_dumped; } +#else +#define aout_core_dump NULL +#endif + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, + .core_dump = aout_core_dump, + .min_coredump = PAGE_SIZE +}; + +#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) + +static int set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end > start) { + unsigned long addr; + addr = vm_brk(start, end - start); + if (BAD_ADDR(addr)) + return addr; + } + return 0; +} /* * create_aout_tables() parses the env- and arg-strings in new user @@ -199,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin * libraries. There is no binary dependent code anywhere else. 
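The rcu_barrier() calls added to the befs and bfs inode-cache teardown hunks above guard against a use-after-free: inodes are freed through call_rcu(), so frees can still be queued when the cache is destroyed. A simplified sketch of the pattern (all names hypothetical):

#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/kernel.h>

static struct kmem_cache *example_inode_cachep;

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(example_inode_cachep, inode);
}

static void example_destroy_inode(struct inode *inode)
{
	/* the actual free runs later, from RCU callback context */
	call_rcu(&inode->i_rcu, example_i_callback);
}

static void example_destroy_inodecache(void)
{
	/* wait for every queued example_i_callback() before destroying */
	rcu_barrier();
	kmem_cache_destroy(example_inode_cachep);
}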
*/ -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_aout_binary(struct linux_binprm * bprm) { + struct pt_regs *regs = current_pt_regs(); struct exec ex; unsigned long error; unsigned long fd_offset; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1b52956afe3..0c42cdbabec 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/vmalloc.h> #include <linux/security.h> #include <linux/random.h> #include <linux/elf.h> @@ -35,9 +36,15 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> -#include <asm/exec.h> -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); +#ifndef user_long_t +#define user_long_t long +#endif +#ifndef user_siginfo_t +#define user_siginfo_t siginfo_t +#endif + +static int load_elf_binary(struct linux_binprm *bprm); static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); @@ -551,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) #endif } -static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_addr = 0, load_bias = 0; @@ -568,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; unsigned long def_flags = 0; + struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; @@ -881,7 +889,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } if (elf_interpreter) { - unsigned long uninitialized_var(interp_map_addr); + unsigned long interp_map_addr = 0; elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, @@ -1115,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (always_dump_vma(vma)) goto whole; - if (vma->vm_flags & VM_NODUMP) + if (vma->vm_flags & VM_DONTDUMP) return 0; /* Hugetlb memory check */ @@ -1127,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. */ @@ -1372,6 +1380,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); } +static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, + siginfo_t *siginfo) +{ + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); + set_fs(old_fs); + fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); +} + +#define MAX_FILE_NOTE_SIZE (4*1024*1024) +/* + * Format of NT_FILE note: + * + * long count -- how many files are mapped + * long page_size -- units for file_ofs + * array of [COUNT] elements of + * long start + * long end + * long file_ofs + * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... 
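The NT_FILE layout described in the comment above can be walked by a core-file consumer roughly as follows. This is a hedged, userspace-style sketch that assumes the dump's word size matches the reader's; it is not code from this diff.

#include <stdio.h>
#include <stddef.h>
#include <string.h>

static void example_parse_nt_file(const long *data, size_t note_size)
{
	long count = data[0];
	long page_size = data[1];
	const long *triple = data + 2;
	const char *name = (const char *)(data + 2 + 3 * count);
	long i;

	(void)note_size;	/* a real parser would bounds-check against this */

	for (i = 0; i < count; i++, triple += 3) {
		/* start, end, file offset in page_size units, then the path */
		printf("%#lx-%#lx @ page %ld (x%ld): %s\n",
		       triple[0], triple[1], triple[2], page_size, name);
		name += strlen(name) + 1;
	}
}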
+ */ +static void fill_files_note(struct memelfnote *note) +{ + struct vm_area_struct *vma; + unsigned count, size, names_ofs, remaining, n; + user_long_t *data; + user_long_t *start_end_ofs; + char *name_base, *name_curpos; + + /* *Estimated* file count and total data size needed */ + count = current->mm->map_count; + size = count * 64; + + names_ofs = (2 + 3 * count) * sizeof(data[0]); + alloc: + if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + goto err; + size = round_up(size, PAGE_SIZE); + data = vmalloc(size); + if (!data) + goto err; + + start_end_ofs = data + 2; + name_base = name_curpos = ((char *)data) + names_ofs; + remaining = size - names_ofs; + count = 0; + for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + struct file *file; + const char *filename; + + file = vma->vm_file; + if (!file) + continue; + filename = d_path(&file->f_path, name_curpos, remaining); + if (IS_ERR(filename)) { + if (PTR_ERR(filename) == -ENAMETOOLONG) { + vfree(data); + size = size * 5 / 4; + goto alloc; + } + continue; + } + + /* d_path() fills at the end, move name down */ + /* n = strlen(filename) + 1: */ + n = (name_curpos + remaining) - filename; + remaining = filename - name_curpos; + memmove(name_curpos, filename, n); + name_curpos += n; + + *start_end_ofs++ = vma->vm_start; + *start_end_ofs++ = vma->vm_end; + *start_end_ofs++ = vma->vm_pgoff; + count++; + } + + /* Now we know exact count of files, can store it */ + data[0] = count; + data[1] = PAGE_SIZE; + /* + * Count usually is less than current->mm->map_count, + * we need to move filenames down. + */ + n = current->mm->map_count - count; + if (n != 0) { + unsigned shift_bytes = n * 3 * sizeof(data[0]); + memmove(name_base - shift_bytes, name_base, + name_curpos - name_base); + name_curpos -= shift_bytes; + } + + size = name_curpos - (char *)data; + fill_note(note, "CORE", NT_FILE, size, data); + err: ; +} + #ifdef CORE_DUMP_USE_REGSET #include <linux/regset.h> @@ -1385,7 +1490,10 @@ struct elf_thread_core_info { struct elf_note_info { struct elf_thread_core_info *thread; struct memelfnote psinfo; + struct memelfnote signote; struct memelfnote auxv; + struct memelfnote files; + user_siginfo_t csigdata; size_t size; int thread_notes; }; @@ -1480,7 +1588,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - long signr, struct pt_regs *regs) + siginfo_t *siginfo, struct pt_regs *regs) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1493,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) + if (psinfo == NULL) { + info->psinfo.data = NULL; /* So we don't free this wrongly */ return 0; + } fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); @@ -1550,7 +1660,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. 
*/ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, signr, &info->size)) + if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) return 0; /* @@ -1559,9 +1669,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); + fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + info->size += notesize(&info->signote); + fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); + fill_files_note(&info->files); + info->size += notesize(&info->files); + return 1; } @@ -1588,8 +1704,12 @@ static int write_note_info(struct elf_note_info *info, if (first && !writenote(&info->psinfo, file, foffset)) return 0; + if (first && !writenote(&info->signote, file, foffset)) + return 0; if (first && !writenote(&info->auxv, file, foffset)) return 0; + if (first && !writenote(&info->files, file, foffset)) + return 0; for (i = 1; i < info->thread_notes; ++i) if (t->notes[i].data && @@ -1616,6 +1736,7 @@ static void free_note_info(struct elf_note_info *info) kfree(t); } kfree(info->psinfo.data); + vfree(info->files.data); } #else @@ -1681,6 +1802,7 @@ struct elf_note_info { #ifdef ELF_CORE_COPY_XFPREGS elf_fpxregset_t *xfpu; #endif + user_siginfo_t csigdata; int thread_status_size; int numnote; }; @@ -1690,48 +1812,37 @@ static int elf_note_info_init(struct elf_note_info *info) memset(info, 0, sizeof(*info)); INIT_LIST_HEAD(&info->thread_list); - /* Allocate space for six ELF notes */ - info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL); + /* Allocate space for ELF notes */ + info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL); if (!info->notes) return 0; info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); if (!info->psinfo) - goto notes_free; + return 0; info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); if (!info->prstatus) - goto psinfo_free; + return 0; info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); if (!info->fpu) - goto prstatus_free; + return 0; #ifdef ELF_CORE_COPY_XFPREGS info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); if (!info->xfpu) - goto fpu_free; + return 0; #endif return 1; -#ifdef ELF_CORE_COPY_XFPREGS - fpu_free: - kfree(info->fpu); -#endif - prstatus_free: - kfree(info->prstatus); - psinfo_free: - kfree(info->psinfo); - notes_free: - kfree(info->notes); - return 0; } static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - long signr, struct pt_regs *regs) + siginfo_t *siginfo, struct pt_regs *regs) { struct list_head *t; if (!elf_note_info_init(info)) return 0; - if (signr) { + if (siginfo->si_signo) { struct core_thread *ct; struct elf_thread_status *ets; @@ -1749,13 +1860,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, int sz; ets = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, ets); + sz = elf_dump_thread_status(siginfo->si_signo, ets); info->thread_status_size += sz; } } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, signr); + fill_prstatus(info->prstatus, current, siginfo->si_signo); elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ @@ -1772,9 +1883,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - info->numnote = 2; + fill_siginfo_note(info->notes + 2, 
&info->csigdata, siginfo); + fill_auxv_note(info->notes + 3, current->mm); + fill_files_note(info->notes + 4); - fill_auxv_note(&info->notes[info->numnote++], current->mm); + info->numnote = 5; /* Try to dump the FPU. */ info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, @@ -1836,6 +1949,9 @@ static void free_note_info(struct elf_note_info *info) kfree(list_entry(tmp, struct elf_thread_status, list)); } + /* Free data allocated by fill_files_note(): */ + vfree(info->notes[4].data); + kfree(info->prstatus); kfree(info->psinfo); kfree(info->notes); @@ -1962,7 +2078,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) + if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3d77cf81ba3..dc84732e554 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,7 +39,6 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> -#include <asm/exec.h> typedef char *elf_caddr_t; @@ -57,7 +56,7 @@ typedef char *elf_caddr_t; MODULE_LICENSE("GPL"); -static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *); +static int load_elf_fdpic_binary(struct linux_binprm *); static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *); static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *, struct mm_struct *, const char *); @@ -165,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, /* * load an fdpic binary into various bits of memory */ -static int load_elf_fdpic_binary(struct linux_binprm *bprm, - struct pt_regs *regs) +static int load_elf_fdpic_binary(struct linux_binprm *bprm) { struct elf_fdpic_params exec_params, interp_params; + struct pt_regs *regs = current_pt_regs(); struct elf_phdr *phdr; unsigned long stack_size, entryaddr; #ifdef ELF_FDPIC_PLAT_INIT @@ -1205,7 +1204,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } @@ -1642,7 +1641,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto cleanup; #endif - if (cprm->signr) { + if (cprm->siginfo->si_signo) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1661,13 +1660,13 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(cprm->signr, tmp); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, cprm->signr); + fill_prstatus(prstatus, current, cprm->siginfo->si_signo); elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 2790c7e1912..037a3e2b045 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -22,7 +22,7 @@ #define EM86_INTERP "/usr/bin/em86" #define EM86_I_NAME "em86" -static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_em86(struct linux_binprm *bprm) { char *interp, *i_name, *i_arg; struct file * file; @@ -42,7 +42,6 @@ static int 
load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; @@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) if (retval < 0) return retval; - return search_binary_handler(bprm, regs); + return search_binary_handler(bprm); } static struct linux_binfmt em86_format = { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 178cb70acc2..b56371981d1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -88,7 +88,7 @@ struct lib_info { static int load_flat_shared_library(int id, struct lib_info *p); #endif -static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_flat_binary(struct linux_binprm *); static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { @@ -107,7 +107,7 @@ static struct linux_binfmt flat_format = { static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) cprm->signr); + current->comm, current->pid, (int) cprm->siginfo->si_signo); return(1); } @@ -858,9 +858,10 @@ out: * libraries. There is no binary dependent code anywhere else. */ -static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_flat_binary(struct linux_binprm * bprm) { struct lib_info libinfo; + struct pt_regs *regs = current_pt_regs(); unsigned long p = bprm->p; unsigned long stack_len; unsigned long start_addr; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 790b3cddca6..0c8869fdd14 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm) /* * the loader itself */ -static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file * interp_file = NULL; @@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!enabled) goto _ret; - retval = -ENOEXEC; - if (bprm->recursion_depth > BINPRM_MAX_RECURSION) - goto _ret; - /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -176,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _error; bprm->argc ++; - bprm->interp = iname; /* for binfmt_script */ + /* Update interp in case binfmt_script needs it. 
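Across the binfmt_* hunks above, loaders lose their pt_regs argument and fetch the registers with current_pt_regs() instead. A minimal, hypothetical handler using the new signature might look like this (the "EXMP" magic and example names are invented for illustration):

#include <linux/binfmts.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/errno.h>

static int load_example_binary(struct linux_binprm *bprm)
{
	struct pt_regs *regs = current_pt_regs();

	if (memcmp(bprm->buf, "EXMP", 4) != 0)
		return -ENOEXEC;	/* not ours; the next binfmt gets a try */

	/* ... flush old exec state, map segments, set up the stack ... */

	start_thread(regs, 0 /* entry point */, bprm->p);
	return 0;
}

static struct linux_binfmt example_format = {
	.module      = THIS_MODULE,
	.load_binary = load_example_binary,
};

static int __init example_binfmt_init(void)
{
	register_binfmt(&example_format);
	return 0;
}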
*/ + retval = bprm_change_interp(iname, bprm); + if (retval < 0) + goto _error; interp_file = open_exec (iname); retval = PTR_ERR (interp_file); @@ -197,9 +196,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - bprm->recursion_depth++; - - retval = search_binary_handler (bprm, regs); + retval = search_binary_handler(bprm); if (retval < 0) goto _error; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d3b8c1f6315..5027a3e1492 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -14,7 +14,7 @@ #include <linux/err.h> #include <linux/fs.h> -static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) +static int load_script(struct linux_binprm *bprm) { const char *i_arg, *i_name; char *cp; @@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) char interp[BINPRM_BUF_SIZE]; int retval; - if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || - (bprm->recursion_depth > BINPRM_MAX_RECURSION)) + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT */ - bprm->recursion_depth++; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; @@ -82,7 +80,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) retval = copy_strings_kernel(1, &i_name, bprm); if (retval) return retval; bprm->argc++; - bprm->interp = interp; + retval = bprm_change_interp(interp, bprm); + if (retval < 0) + return retval; /* * OK, now restart the process with the interpreter's dentry. @@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) retval = prepare_binprm(bprm); if (retval < 0) return retval; - return search_binary_handler(bprm,regs); + return search_binary_handler(bprm); } static struct linux_binfmt script_format = { diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 4517aaff61b..4e00ed68d4a 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -35,7 +35,7 @@ #include <linux/elf.h> -static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs); +static int load_som_binary(struct linux_binprm * bprm); static int load_som_library(struct file *); /* @@ -180,13 +180,14 @@ out: */ static int -load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) +load_som_binary(struct linux_binprm * bprm) { int retval; unsigned int size; unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; + struct pt_regs *regs = current_pt_regs(); /* Get the exec-header */ som_ex = (struct som_hdr *) bprm->buf; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index e85c04b9f61..a3f28f331b2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx) } /** - * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements - * @bs: bio_set to allocate from * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. 
*/ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs, - struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { struct bio_integrity_payload *bip; unsigned int idx = vecs_to_idx(nr_vecs); + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bio == NULL); bip = NULL; @@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, return bip; } -EXPORT_SYMBOL(bio_integrity_alloc_bioset); - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); -} EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed - * @bs: bio_set this bio was allocated from * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ -void bio_integrity_free(struct bio *bio, struct bio_set *bs) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bip == NULL); @@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split); * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask - * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask, struct bio_set *bs) + gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (bip == NULL) return -EIO; @@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { * IO code that does not need private memory pools. 
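With the consolidation above, callers attach integrity metadata without naming a bio_set; the payload is allocated and later freed against bio->bi_pool. A hedged usage sketch (the caller and its error policy are hypothetical):

#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_attach_integrity(struct bio *bio, unsigned int nr_vecs)
{
	struct bio_integrity_payload *bip;

	bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vecs);
	if (!bip)
		return -ENOMEM;

	/* ... add integrity pages with bio_integrity_add_page() ... */

	/* no explicit free here: bio_free() calls bio_integrity_free(bio) */
	return 0;
}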
*/ struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management @@ -73,7 +74,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) { unsigned int sz = sizeof(struct bio) + extra_size; struct kmem_cache *slab = NULL; - struct bio_slab *bslab; + struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; unsigned int i, entry = -1; mutex_lock(&bio_slab_lock); @@ -96,12 +98,14 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) goto out_unlock; if (bio_slab_nr == bio_slab_max && entry == -1) { - bio_slab_max <<= 1; - bio_slabs = krealloc(bio_slabs, - bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!bio_slabs) + new_bio_slab_max = bio_slab_max << 1; + new_bio_slabs = krealloc(bio_slabs, + new_bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) goto out_unlock; + bio_slab_max = new_bio_slab_max; + bio_slabs = new_bio_slabs; } if (entry == -1) entry = bio_slab_nr++; @@ -232,26 +236,37 @@ fallback: return bvl; } -void bio_free(struct bio *bio, struct bio_set *bs) +static void __bio_free(struct bio *bio) { + bio_disassociate_task(bio); + + if (bio_integrity(bio)) + bio_integrity_free(bio); +} + +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; void *p; - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + __bio_free(bio); - if (bio_integrity(bio)) - bio_integrity_free(bio, bs); + if (bs) { + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - if (bs->front_pad) + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; p -= bs->front_pad; - mempool_free(p, bs->bio_pool); + mempool_free(p, bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } } -EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -262,48 +277,85 @@ void bio_init(struct bio *bio) EXPORT_SYMBOL(bio_init); /** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned bio bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio. + */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + __bio_free(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags|(1 << BIO_UPTODATE); +} +EXPORT_SYMBOL(bio_reset); + +/** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * * Description: - * bio_alloc_bioset will try its own mempool to satisfy the allocation. - * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. * - * Note that the caller must set ->bi_destructor on successful return - * of a bio, to do the appropriate freeing of the bio once the reference - * count drops to zero. - **/ + * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + * able to allocate a bio. This is due to the mempool guarantees. 
To make this + * work, callers must never allocate more than 1 bio at a time from this pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned front_pad; + unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; struct bio *bio; void *p; - p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + p = mempool_alloc(bs->bio_pool, gfp_mask); + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + if (unlikely(!p)) return NULL; - bio = p + bs->front_pad; + bio = p + front_pad; bio_init(bio); - if (unlikely(!nr_iovecs)) - goto out_set; - - if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; - } else { + if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) goto err_free; - - nr_iovecs = bvec_nr_vecs(idx); + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; } -out_set: + + bio->bi_pool = bs; bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; @@ -315,80 +367,6 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -/** - * bio_alloc - allocate a new bio, memory pool backed - * @gfp_mask: allocation mask to use - * @nr_iovecs: number of iovecs - * - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; -} -EXPORT_SYMBOL(bio_alloc); - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_integrity(bio)) - bio_integrity_free(bio, fs_bio_set); - kfree(bio); -} - -/** - * bio_kmalloc - allocate a bio for I/O using kmalloc() - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * - * Description: - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains - * %__GFP_WAIT, the allocation is guaranteed to succeed. 
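A short usage sketch for the consolidated allocator documented above: passing a bio_set gives mempool-backed allocation, passing NULL falls back to kmalloc(); either way the final bio_put() routes the free through bio->bi_pool. The helper below is illustrative only.

#include <linux/bio.h>
#include <linux/blkdev.h>

static struct bio *example_new_bio(struct block_device *bdev, sector_t sector)
{
	/* GFP_NOIO includes __GFP_WAIT, so the mempool guarantees progress */
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, fs_bio_set);

	if (!bio)
		return NULL;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	return bio;
}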
- * - **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - struct bio *bio; - - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - if (unlikely(!bio)) - return NULL; - - bio_init(bio); - bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_destructor = bio_kmalloc_destructor; - - return bio; -} -EXPORT_SYMBOL(bio_kmalloc); - void zero_fill_bio(struct bio *bio) { unsigned long flags; @@ -419,11 +397,8 @@ void bio_put(struct bio *bio) /* * last put frees it */ - if (atomic_dec_and_test(&bio->bi_cnt)) { - bio_disassociate_task(bio); - bio->bi_next = NULL; - bio->bi_destructor(bio); - } + if (atomic_dec_and_test(&bio->bi_cnt)) + bio_free(bio); } EXPORT_SYMBOL(bio_put); @@ -465,26 +440,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** - * bio_clone - clone a bio + * bio_clone_bioset - clone a bio * @bio: bio to clone * @gfp_mask: allocation priority + * @bs: bio_set to allocate from * * Like __bio_clone, only also allocates the returned bio */ -struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, + struct bio_set *bs) { - struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); + struct bio *b; + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); if (!b) return NULL; - b->bi_destructor = bio_fs_destructor; __bio_clone(b, bio); if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask); if (ret < 0) { bio_put(b); @@ -494,7 +471,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } -EXPORT_SYMBOL(bio_clone); +EXPORT_SYMBOL(bio_clone_bioset); /** * bio_get_nr_vecs - return approx number of vecs @@ -1312,7 +1289,7 @@ EXPORT_SYMBOL(bio_copy_kern); * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, pdflush) could clean the pages if they are mapped + * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. 
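The rename from bio_clone() to bio_clone_bioset() above lets stacking drivers clone into their own pool instead of fs_bio_set. A sketch of that usage, with the completion handler and bio_set names hypothetical:

#include <linux/bio.h>
#include <linux/gfp.h>

static void example_clone_endio(struct bio *clone, int err)
{
	struct bio *orig = clone->bi_private;

	bio_put(clone);
	bio_endio(orig, err);
}

static struct bio *example_clone(struct bio *orig, struct bio_set *my_bio_set)
{
	struct bio *clone = bio_clone_bioset(orig, GFP_NOIO, my_bio_set);

	if (!clone)
		return NULL;

	clone->bi_private = orig;
	clone->bi_end_io  = example_clone_endio;
	return clone;
}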
* * Simply disabling the call to bio_set_pages_dirty() is a good way to test the @@ -1500,7 +1477,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1); + BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); BUG_ON(bi->bi_idx != 0); atomic_set(&bp->cnt, 3); bp->error = 0; @@ -1510,17 +1487,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio2.bi_size -= first_sectors << 9; bp->bio1.bi_size = first_sectors << 9; - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; + if (bi->bi_vcnt != 0) { + bp->bv1 = bi->bi_io_vec[0]; + bp->bv2 = bi->bi_io_vec[0]; + + if (bio_is_rw(bi)) { + bp->bv2.bv_offset += first_sectors << 9; + bp->bv2.bv_len -= first_sectors << 9; + bp->bv1.bv_len = first_sectors << 9; + } - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; + bp->bio1.bi_io_vec = &bp->bv1; + bp->bio2.bi_io_vec = &bp->bv2; - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + } bp->bio1.bi_end_io = bio_pair_end_1; bp->bio2.bi_end_io = bio_pair_end_2; diff --git a/fs/block_dev.c b/fs/block_dev.c index c2bbe1fb132..172f8491a2b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -sector_t blkdev_max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -163,52 +150,12 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= blkdev_max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = blkdev_max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. 
We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - static ssize_t blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_blocks, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -374,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, * for a block special file file->f_path.dentry->d_inode->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ -static loff_t block_llseek(struct file *file, loff_t offset, int origin) +static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = file->f_mapping->host; loff_t size; @@ -384,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) size = i_size_read(bd_inode); retval = -EINVAL; - switch (origin) { + switch (whence) { case SEEK_END: offset += size; break; @@ -1578,10 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1590,10 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); +static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); + + if (pos >= size) + return 0; + + size -= pos; + if (size < INT_MAX) + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); + return generic_file_aio_read(iocb, iov, nr_segs, pos); +} + /* * Try to release a page associated with block device when the system * is under memory pressure. 
@@ -1624,7 +1590,7 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read = do_sync_read, .write = do_sync_write, - .aio_read = generic_file_aio_read, + .aio_read = blkdev_aio_read, .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = blkdev_fsync, @@ -1710,3 +1676,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) return res; } EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + + func(I_BDEV(inode), arg); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); +} diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4fa2befae..7df3e0f0ee5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 761e2cd8fed..e15d2b0d8d3 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); } if (size > 0) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? 
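The btrfs ACL hunks above follow the same namespaced conversion applied throughout this series: posix_acl_from_xattr() and posix_acl_to_xattr() now take an explicit user namespace. A hedged sketch of reading an ACL from an xattr with the new API (the getxattr helper is hypothetical):

#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/user_namespace.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/errno.h>

/* hypothetical: read the named xattr, return its size or -errno */
extern int example_getxattr(struct inode *inode, const char *name,
			    void *value, size_t size);

static struct posix_acl *example_get_acl(struct inode *inode, const char *name)
{
	struct posix_acl *acl;
	char *value;
	int size;

	size = example_getxattr(inode, name, NULL, 0);
	if (size < 0)
		return (size == -ENODATA) ? NULL : ERR_PTR(size);
	if (size == 0)
		return NULL;

	value = kzalloc(size, GFP_NOFS);
	if (!value)
		return ERR_PTR(-ENOMEM);

	size = example_getxattr(inode, name, value, size);
	if (size > 0)
		acl = posix_acl_from_xattr(&init_user_ns, value, size);
	else
		acl = ERR_PTR(size);

	kfree(value);
	return acl;
}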
I think nobody */ acl = NULL; @@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, return PTR_ERR(acl); if (acl == NULL) return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return ret; @@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; + if (ret == 0) + acl = NULL; } ret = 0; break; @@ -141,7 +143,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = posix_acl_to_xattr(acl, value, size); + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) goto out; } @@ -169,7 +171,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (value) { - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 42704149b72..58b7d14b08e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers, work->ordered_func(work); - /* now take the lock again and call the freeing code */ + /* now take the lock again and drop our item from the list */ spin_lock(&workers->order_lock); list_del(&work->order_list); + spin_unlock(&workers->order_lock); + + /* + * we don't want to call the ordered free functions + * with the lock held though + */ work->ordered_free(work); + spin_lock(&workers->order_lock); } spin_unlock(&workers->order_lock); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a383c18e74e..04edf69be87 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { ret = ulist_add(parents, eb->start, - (unsigned long)eie, GFP_NOFS); + (uintptr_t)eie, GFP_NOFS); if (ret < 0) break; if (!extent_item_pos) { @@ -282,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) goto out; @@ -363,8 +362,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; - ref->inode_list = - node ? (struct extent_inode_elem *)node->aux : 0; + ref->inode_list = node ? 
+ (struct extent_inode_elem *)(uintptr_t)node->aux : 0; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { @@ -375,8 +374,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; - new_ref->inode_list = - (struct extent_inode_elem *)node->aux; + new_ref->inode_list = (struct extent_inode_elem *) + (uintptr_t)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -462,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode) pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; struct __prelim_ref *xchg; + struct extent_inode_elem *eie; ref2 = list_entry(pos2, struct __prelim_ref, list); @@ -473,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = ref2; ref2 = xchg; } - ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) continue; - ref1->count += ref2->count; } + + eie = ref1->inode_list; + while (eie && eie->next) + eie = eie->next; + if (eie) + eie->next = ref2->inode_list; + else + ref1->inode_list = ref2->inode_list; + ref1->count += ref2->count; + list_del(&ref2->list); kfree(ref2); } @@ -773,9 +781,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist *refs, struct ulist *roots, - const u64 *extent_item_pos) + u64 time_seq, struct ulist *refs, + struct ulist *roots, const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; @@ -837,7 +844,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, delayed_ref_seq, + ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); if (ret) { @@ -892,8 +899,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); + WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); @@ -915,8 +921,8 @@ again: free_extent_buffer(eb); } ret = ulist_add_merge(refs, ref->parent, - (unsigned long)ref->inode_list, - (unsigned long *)&eie, GFP_NOFS); + (uintptr_t)ref->inode_list, + (u64 *)&eie, GFP_NOFS); if (!ret && extent_item_pos) { /* * we've recorded that parent, so we must extend @@ -960,7 +966,7 @@ static void free_leaf_list(struct ulist *blocks) while ((node = ulist_next(blocks, &uiter))) { if (!node->aux) continue; - eie = (struct extent_inode_elem *)node->aux; + eie = (struct extent_inode_elem *)(uintptr_t)node->aux; for (; eie; eie = eie_next) { eie_next = eie->next; kfree(eie); @@ -981,8 +987,7 @@ static void free_leaf_list(struct ulist *blocks) */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **leafs, + u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { struct ulist *tmp; @@ -997,7 +1002,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); @@ -1024,8 +1029,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int 
btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1043,7 +1047,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); @@ -1111,44 +1115,97 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, found_key); } -/* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off) +{ + int ret, slot; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + unsigned long ptr; + + key.objectid = inode_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.offset = start_off; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If the item at offset is not found, + * btrfs_search_slot will point us to the slot + * where it should be inserted. In our case + * that will be the slot directly before the + * next INODE_REF_KEY_V2 item. In the case + * that we're pointing to the last slot in a + * leaf, we must move one leaf over. + */ + ret = btrfs_next_leaf(root, path); + if (ret) { + if (ret >= 1) + ret = -ENOENT; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* + * Check that we're still looking at an extended ref key for + * this particular objectid. If we have different + * objectid or type then there are no more to be found + * in the tree and we can exit. 
+ */ + ret = -ENOENT; + if (found_key.objectid != inode_objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + break; + + ret = 0; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + extref = (struct btrfs_inode_extref *)ptr; + *ret_extref = extref; + if (found_off) + *found_off = found_key.offset; + break; + } + + return ret; +} + +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) { - u32 len; int slot; u64 next_inum; int ret; - s64 bytes_left = size - 1; + s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; int leave_spinning = path->leave_spinning; + struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; path->leave_spinning = 1; while (1) { - len = btrfs_inode_ref_name_len(eb, iref); - bytes_left -= len; + bytes_left -= name_len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, - (unsigned long)(iref + 1), len); + name_off, name_len); if (eb != eb_in) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1158,6 +1215,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, ret = -ENOENT; if (ret) break; + next_inum = found_key.offset; /* regular exit ahead */ @@ -1173,8 +1231,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); } btrfs_release_path(path); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + name_len = btrfs_inode_ref_name_len(eb, iref); + name_off = (unsigned long)(iref + 1); + parent = next_inum; --bytes_left; if (bytes_left >= 0) @@ -1191,12 +1252,39 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } /* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ +char *btrfs_iref_to_path(struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + return btrfs_ref_to_path(fs_root, path, + btrfs_inode_ref_name_len(eb_in, iref), + (unsigned long)(iref + 1), + eb_in, parent, dest, size); +} + +/* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for * tree blocks and <0 on error. 
*/ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key) + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags_ret) { int ret; u64 flags; @@ -1240,10 +1328,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, (unsigned long long)found_key->objectid, (unsigned long long)found_key->offset, (unsigned long long)flags, item_size); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - return BTRFS_EXTENT_FLAG_TREE_BLOCK; - if (flags & BTRFS_EXTENT_FLAG_DATA) - return BTRFS_EXTENT_FLAG_DATA; + + WARN_ON(!flags_ret); + if (flags_ret) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else if (flags & BTRFS_EXTENT_FLAG_DATA) + *flags_ret = BTRFS_EXTENT_FLAG_DATA; + else + BUG_ON(1); + return 0; + } return -EIO; } @@ -1376,11 +1471,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem = {}; struct seq_list tree_mod_seq_elem = {}; struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; - struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); @@ -1391,16 +1484,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, tree_mod_seq_elem.seq, &refs, + tree_mod_seq_elem.seq, &refs, &extent_item_pos); if (ret) goto out; @@ -1408,19 +1496,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - seq_elem.seq, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " - "%#lx\n", root_node->val, ref_node->val, - ref_node->aux); - ret = iterate_leaf_refs( - (struct extent_inode_elem *)ref_node->aux, - root_node->val, extent_item_objectid, - iterate, ctx); + "%#llx\n", root_node->val, ref_node->val, + (long long)ref_node->aux); + ret = iterate_leaf_refs((struct extent_inode_elem *) + (uintptr_t)ref_node->aux, + root_node->val, + extent_item_objectid, + iterate, ctx); } ulist_free(roots); roots = NULL; @@ -1431,7 +1519,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); - btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } @@ -1444,16 +1531,16 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, { int ret; u64 extent_item_pos; + u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; - ret = extent_from_logical(fs_info, logical, path, - &found_key); + ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = -EINVAL; if (ret < 0) return ret; + 
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return -EINVAL; extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, @@ -1463,9 +1550,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); + +static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) { int ret = 0; int slot; @@ -1482,7 +1572,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, while (!ret) { path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + &found_key); if (ret < 0) break; if (ret) { @@ -1510,7 +1600,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, "tree %llu\n", cur, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); - ret = iterate(parent, iref, eb, ctx); + ret = iterate(parent, name_len, + (unsigned long)(iref + 1), eb, ctx); if (ret) break; len = sizeof(*iref) + name_len; @@ -1525,12 +1616,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, return ret; } +static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u64 offset = 0; + u64 parent; + int found = 0; + struct extent_buffer *eb; + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u32 item_size; + u32 cur_offset; + unsigned long ptr; + + while (1) { + ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref, + &offset); + if (ret < 0) + break; + if (ret) { + ret = found ? 
0 : -ENOENT; + break; + } + ++found; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_release_path(path); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; + + while (cur_offset < item_size) { + u32 name_len; + + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + name_len = btrfs_inode_extref_name_len(eb, extref); + ret = iterate(parent, name_len, + (unsigned long)&extref->name, eb, ctx); + if (ret) + break; + + cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += sizeof(*extref); + } + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + + offset++; + } + + btrfs_release_path(path); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, iterate_irefs_t *iterate, + void *ctx) +{ + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; +} + /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ -static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx) +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx) { struct inode_fs_paths *ipath = ctx; char *fspath; @@ -1543,20 +1720,16 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, ipath->fspath->bytes_left - s_ptr : 0; fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, - inum, fspath_min, bytes_left); + fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len, + name_off, eb, inum, fspath_min, bytes_left); if (IS_ERR(fspath)) return PTR_ERR(fspath); if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { - pr_debug("missed path, not enough space. 
missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; @@ -1578,7 +1751,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + inode_to_path, ipath); } struct btrfs_data_container *init_data_container(u32 total_bytes) @@ -1587,7 +1760,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kmalloc(alloc_bytes, GFP_NOFS); + data = vmalloc(alloc_bytes); if (!data) return ERR_PTR(-ENOMEM); @@ -1638,6 +1811,6 @@ void free_ipath(struct inode_fs_paths *ipath) { if (!ipath) return; - kfree(ipath->fspath); + vfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c18d8ac7b79..d61feca7945 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -21,6 +21,7 @@ #include "ioctl.h" #include "ulist.h" +#include "extent_io.h" #define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) @@ -32,14 +33,13 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx); int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, struct btrfs_path *path); int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key); + struct btrfs_path *path, struct btrfs_key *found_key, + u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_extent_item *ei, u32 item_size, @@ -58,12 +58,23 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots); + u64 time_seq, struct ulist **roots); +char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, struct extent_buffer *eb, + u64 parent, char *dest, u32 size); +char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); void free_ipath(struct inode_fs_paths *ipath); +int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, + u64 start_off, struct btrfs_path *path, + struct btrfs_inode_extref **ret_extref, + u64 *found_off); + #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 12394a90d60..2a8c242bc4f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -38,6 +38,8 @@ #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 +#define BTRFS_INODE_NEEDS_FULL_SYNC 7 +#define BTRFS_INODE_COPY_EVERYTHING 8 /* in memory btrfs inode */ struct btrfs_inode { @@ -87,11 +89,11 @@ struct btrfs_inode { /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; - /* the space_info for 
where this inode's data allocations are done */ - struct btrfs_space_info *space_info; - unsigned long runtime_flags; + /* Keep track of who's O_SYNC/fsycing currently */ + atomic_t sync_writers; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -146,6 +148,9 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; + /* a local copy of root's last_log_commit */ + unsigned long last_log_commit; + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent @@ -191,26 +196,24 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } -static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, - struct inode *inode) +static inline bool btrfs_is_free_space_inode(struct inode *inode) { - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (root == root->fs_info->tree_root && + btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) + return true; + if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) return true; return false; } static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); if (BTRFS_I(inode)->logged_trans == generation && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; + BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit) + return 1; + return 0; } #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index da6e9364a5e..11d47bfb62b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -37,8 +37,9 @@ * the file system was mounted, (i.e., they have been * referenced by the super block) or they have been * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. + * was called and no write error was indicated and a + * FLUSH request to the device where these blocks are + * located was received and completed. * 2b. All referenced blocks need to have a generation * number which is equal to the parent's number. 
* @@ -136,7 +137,7 @@ struct btrfsic_block { unsigned int never_written:1; /* block was added because it was * referenced, not because it was * written */ - unsigned int mirror_num:2; /* large enough to hold + unsigned int mirror_num; /* large enough to hold * BTRFS_SUPER_MIRROR_MAX */ struct btrfsic_dev_state *dev_state; u64 dev_bytenr; /* key, physical byte num on disk */ @@ -722,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -902,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1032,6 +1033,7 @@ continue_with_current_leaf_stack_frame: struct btrfs_disk_key *disk_key; u8 type; u32 item_offset; + u32 item_size; if (disk_item_offset + sizeof(struct btrfs_item) > sf->block_ctx->len) { @@ -1047,6 +1049,7 @@ leaf_item_out_of_bounce_error: disk_item_offset, sizeof(struct btrfs_item)); item_offset = le32_to_cpu(disk_item.offset); + item_size = le32_to_cpu(disk_item.size); disk_key = &disk_item.key; type = disk_key->type; @@ -1057,14 +1060,13 @@ leaf_item_out_of_bounce_error: root_item_offset = item_offset + offsetof(struct btrfs_leaf, items); - if (root_item_offset + - sizeof(struct btrfs_root_item) > + if (root_item_offset + item_size > sf->block_ctx->len) goto leaf_item_out_of_bounce_error; btrfsic_read_from_block_data( sf->block_ctx, &root_item, root_item_offset, - sizeof(struct btrfs_root_item)); + item_size); next_bytenr = le64_to_cpu(root_item.bytenr); sf->error = @@ -1285,7 +1287,7 @@ static int btrfsic_create_link_to_next_block( *next_blockp = NULL; if (0 == *num_copiesp) { *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1487,7 +1489,7 @@ static int btrfsic_handle_extent_data( chunk_len = num_bytes; num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1580,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + ret = btrfs_map_block(state->root->fs_info, READ, bytenr, &length, &multi, mirror_num); + if (ret) { + block_ctx_out->start = 0; + block_ctx_out->dev_bytenr = 0; + block_ctx_out->len = 0; + block_ctx_out->dev = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + return ret; + } + device = multi->stripes[0].dev; block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; @@ -1592,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - if (0 == ret) 
- kfree(multi); + kfree(multi); if (NULL == block_ctx_out->dev) { ret = -ENXIO; printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); @@ -2461,7 +2474,7 @@ static int btrfsic_process_written_superblock( } num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, + btrfs_num_copies(state->root->fs_info, next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2600,6 +2613,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, (unsigned long long)l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; + } else if (l->block_ref_to->iodone_w_error) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which has write error!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; } else if (l->parent_generation != l->block_ref_to->generation && BTRFSIC_GENERATION_UNKNOWN != @@ -2947,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, struct btrfsic_block_data_ctx block_ctx; int match = 0; - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(state->root->fs_info, bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 86eff48dab7..94ab2f80e7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_start; struct extent_map *em; int ret = -ENOMEM; + int faili = 0; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!cb->compressed_pages[pg_index]) + if (!cb->compressed_pages[pg_index]) { + faili = pg_index - 1; + ret = -ENOMEM; goto fail2; + } } + faili = nr_pages - 1; cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -682,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); @@ -707,14 +713,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + bio_endio(comp_bio, ret); bio_put(comp_bio); return 0; fail2: - for (pg_index = 0; pg_index < nr_pages; pg_index++) - free_page((unsigned long)cb->compressed_pages[pg_index]); + while (faili >= 0) { + __free_page(cb->compressed_pages[faili]); + faili--; + } kfree(cb->compressed_pages); fail1: @@ -818,6 +827,7 @@ static void free_workspace(int type, struct list_head *workspace) btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(alloc_workspace); wake: + smp_mb(); if (waitqueue_active(workspace_wait)) wake_up(workspace_wait); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8206b390058..eea5da7a2b9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct 
extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log); + struct btrfs_path *path, int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -321,7 +320,7 @@ struct tree_mod_root { struct tree_mod_elem { struct rb_node node; u64 index; /* shifted logical */ - struct seq_list elem; + u64 seq; enum mod_log_op op; /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ @@ -341,20 +340,50 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void -__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) { - elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + read_lock(&fs_info->tree_mod_log_lock); } -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) +{ + read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ + write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) +{ + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) { - elem->flags = 1; + u64 seq; + + tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); - __get_tree_mod_seq(fs_info, elem); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } + seq = btrfs_inc_tree_mod_seq(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); + + return seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -371,41 +400,40 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - BUG_ON(!(elem->flags & 1)); spin_lock(&fs_info->tree_mod_seq_lock); list_del(&elem->list); + elem->seq = 0; list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (cur_elem->seq < min_seq) { if (seq_putting > cur_elem->seq) { /* * blocker with lower sequence number exists, we * cannot remove anything from the log */ - goto out; + spin_unlock(&fs_info->tree_mod_seq_lock); + return; } min_seq = cur_elem->seq; } } + spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. 
*/ - write_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_write_lock(fs_info); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = container_of(node, struct tree_mod_elem, node); - if (tm->elem.seq > min_seq) + if (tm->seq > min_seq) continue; rb_erase(node, tm_root); - list_del(&tm->elem.list); kfree(tm); } - write_unlock(&fs_info->tree_mod_log_lock); -out: - spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); } /* @@ -423,11 +451,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; - BUG_ON(!tm || !tm->elem.seq); + BUG_ON(!tm || !tm->seq); - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; while (*new) { @@ -437,88 +463,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->index > tm->index) new = &((*new)->rb_right); - else if (cur->elem.seq < tm->elem.seq) + else if (cur->seq < tm->seq) new = &((*new)->rb_left); - else if (cur->elem.seq > tm->elem.seq) + else if (cur->seq > tm->seq) new = &((*new)->rb_right); else { kfree(tm); - ret = -EEXIST; - goto unlock; + return -EEXIST; } } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -unlock: - write_unlock(&fs_info->tree_mod_log_lock); - return ret; + return 0; } +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { smp_mb(); if (list_empty(&(fs_info)->tree_mod_seq_list)) return 1; - if (!eb) - return 0; - if (btrfs_header_level(eb) == 0) + if (eb && btrfs_header_level(eb) == 0) + return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list when no blocker exists. + */ + tree_mod_log_write_unlock(fs_info); return 1; + } + return 0; } /* - * This allocates memory and gets a tree modification sequence number when - * needed. + * This allocates memory and gets a tree modification sequence number. * - * Returns 0 when no sequence number is needed, < 0 on error. - * Returns 1 when a sequence number was added. In this case, - * fs_info->tree_mod_seq_lock was acquired and must be released by the caller - * after inserting into the rb tree. + * Returns <0 on error. + * Returns >0 (the added sequence number) on success. */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { struct tree_mod_elem *tm; - int seq; - - if (tree_mod_dont_log(fs_info, NULL)) - return 0; - tm = *tm_ret = kzalloc(sizeof(*tm), flags); + /* + * once we switch from spin locks to something different, we should + * honor the flags parameter here. + */ + tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) return -ENOMEM; - tm->elem.flags = 0; - spin_lock(&fs_info->tree_mod_seq_lock); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list, because no blocker exists. 
items - * are removed from the list only when the existing blocker is - * removed from the list. - */ - kfree(tm); - seq = 0; - spin_unlock(&fs_info->tree_mod_seq_lock); - } else { - __get_tree_mod_seq(fs_info, &tm->elem); - seq = tm->elem.seq; - } - - return seq; + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + return tm->seq; } -static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static inline int +__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - struct tree_mod_elem *tm; int ret; + struct tree_mod_elem *tm; ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) + if (ret < 0) return ret; tm->index = eb->start >> PAGE_CACHE_SHIFT; @@ -530,8 +549,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + int ret; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + + tree_mod_log_write_unlock(fs_info); return ret; } @@ -543,6 +576,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } static noinline int +tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op) +{ + return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) @@ -554,15 +595,20 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, eb)) return 0; + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. 
+ */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, MOD_LOG_KEY_REMOVE_WHILE_MOVING); BUG_ON(ret < 0); } ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -571,10 +617,29 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_MOVE_KEYS; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } +static inline void +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ + int i; + u32 nritems; + int ret; + + if (btrfs_header_level(eb) == 0) + return; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key_locked(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, @@ -583,9 +648,12 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm; int ret; + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -594,7 +662,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_ROOT_REPLACE; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } @@ -608,7 +677,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *found = NULL; u64 index = start >> PAGE_CACHE_SHIFT; - read_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_lock(fs_info); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -617,18 +686,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, node = node->rb_left; } else if (cur->index > index) { node = node->rb_right; - } else if (cur->elem.seq < min_seq) { + } else if (cur->seq < min_seq) { node = node->rb_left; } else if (!smallest) { /* we want the node with the highest seq */ if (found) - BUG_ON(found->elem.seq > cur->elem.seq); + BUG_ON(found->seq > cur->seq); found = cur; node = node->rb_left; - } else if (cur->elem.seq > min_seq) { + } else if (cur->seq > min_seq) { /* we want the node with the smallest seq */ if (found) - BUG_ON(found->elem.seq < cur->elem.seq); + BUG_ON(found->seq < cur->seq); found = cur; node = node->rb_right; } else { @@ -636,7 +705,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - read_unlock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_unlock(fs_info); return found; } @@ -664,7 +733,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static inline void +static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) @@ -675,18 +744,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (tree_mod_dont_log(fs_info, NULL)) return; - if (btrfs_header_level(dst) == 0 && 
btrfs_header_level(src) == 0) + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { + tree_mod_log_write_unlock(fs_info); return; + } - /* speed this up by single seq for all operations? */ for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, - MOD_LOG_KEY_REMOVE); + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); - ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, - MOD_LOG_KEY_ADD); + ret = tree_mod_log_insert_key_locked(fs_info, dst, + i + dst_offset, + MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } + + tree_mod_log_write_unlock(fs_info); } static inline void @@ -699,10 +773,9 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, BUG_ON(ret < 0); } -static inline void +static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int slot, int atomic) + struct extent_buffer *eb, int slot, int atomic) { int ret; @@ -712,30 +785,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static noinline void +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - int i; - int ret; - u32 nritems; - if (tree_mod_dont_log(fs_info, eb)) return; - nritems = btrfs_header_nritems(eb); - for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); - BUG_ON(ret < 0); - } + __tree_mod_log_free_eb(fs_info, eb); + + tree_mod_log_write_unlock(fs_info); } -static inline void +static noinline void tree_mod_log_set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node) { int ret; - tree_mod_log_free_eb(root->fs_info, root->node); ret = tree_mod_log_insert_root(root->fs_info, root->node, new_root_node, GFP_NOFS); BUG_ON(ret < 0); @@ -862,12 +927,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - /* - * don't log freeing in case we're freeing the root node, this - * is done by tree_mod_log_set_root_pointer later - */ - if (buf != root->node && btrfs_header_level(buf) != 0) - tree_mod_log_free_eb(root->fs_info, buf); + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1069,7 +1129,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - while (tm && tm->elem.seq >= time_seq) { + while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for * the modification. 
as we're going backwards, we do the @@ -1161,6 +1221,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, free_extent_buffer(eb); __tree_mod_log_rewind(eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); return eb_rewin; } @@ -1177,9 +1239,11 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; + struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; + u32 blocksize; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); @@ -1195,14 +1259,32 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } tm = tree_mod_log_search(root->fs_info, logical, time_seq); - if (old_root) + if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + blocksize = btrfs_level_size(root, old_root->level); + old = read_tree_block(root, logical, blocksize, 0); + if (!old) { + pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", + logical); + WARN_ON(1); + } else { + eb = btrfs_clone_extent_buffer(old); + free_extent_buffer(old); + } + } else if (old_root) { + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); eb = alloc_dummy_extent_buffer(logical, root->nodesize); - else + } else { eb = btrfs_clone_extent_buffer(root->node); - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + } + if (!eb) return NULL; + extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1215,11 +1297,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq) __tree_mod_log_rewind(eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); - extent_buffer_get(eb); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); return eb; } +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { + level = tm->old_root.level; + } else { + rcu_read_lock(); + level = btrfs_header_level(root->node); + rcu_read_unlock(); + } + + return level; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -1260,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", + if (trans->transaction != root->fs_info->running_transaction) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long) root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", + + if (trans->transid != root->fs_info->generation) + WARN(1, KERN_CRIT "trans %llu running %llu\n", (unsigned long long)trans->transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -1368,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (cache_only && parent_level != 1) return 0; - if (trans->transaction != root->fs_info->running_transaction) - WARN_ON(1); - 
if (trans->transid != root->fs_info->generation) - WARN_ON(1); + WARN_ON(trans->transaction != root->fs_info->running_transaction); + WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); blocksize = btrfs_level_size(root, parent_level - 1); @@ -1661,6 +1755,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -1725,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1, 1); + del_ptr(trans, root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1734,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &right_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1769,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot, 1); + del_ptr(trans, root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -1778,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + tree_mod_log_set_node_key(root->fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1878,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot, 0); + pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1931,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); tree_mod_log_set_node_key(root->fs_info, parent, - &disk_key, pslot + 1, 0); + pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2117,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; + if (path->really_keep_locks) + return; + for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2164,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks) + if (path->keep_locks || path->really_keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2397,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->keep_locks || p->lowest_level)) + if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = 
write_lock_level; @@ -2466,7 +2564,10 @@ again: * must have write locks on this node and the * parent */ - if (level + 1 > write_lock_level) { + if (level > write_lock_level || + (level + 1 > write_lock_level && + level + 1 < BTRFS_MAX_LEVEL && + p->nodes[level + 1])) { write_lock_level = level + 1; btrfs_release_path(p); goto again; @@ -2722,6 +2823,80 @@ done: } /* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret < 0) + return ret; + if (!ret) { + p->slots[0] = btrfs_header_nritems(leaf) - 1; + return 0; + } + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } else { + --p->slots[0]; + } + } + return 0; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops @@ -2741,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); + tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -2832,8 +3007,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { - tree_mod_log_eb_move(root->fs_info, src, 0, push_items, - src_nritems - push_items); + /* + * don't call tree_mod_log_eb_move here, key removal was already + * fully logged by tree_mod_log_eb_copy above. 
+ */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -3124,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, */ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { + struct btrfs_item *start_item; + struct btrfs_item *end_item; + struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); + btrfs_init_map_token(&token); + start_item = btrfs_item_nr(l, start); + end_item = btrfs_item_nr(l, end); + data_len = btrfs_token_item_offset(l, start_item, &token) + + btrfs_token_item_size(l, start_item, &token); + data_len = data_len - btrfs_token_item_offset(l, end_item, &token); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -3225,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (push_items == 0) goto out_unlock; - if (!empty && push_items == left_nritems) - WARN_ON(1); + WARN_ON(!empty && push_items == left_nritems); /* push left to right */ right_nritems = btrfs_header_nritems(right); @@ -3464,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, + if (push_items > right_nritems) + WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, right_nritems); - WARN_ON(1); - } if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - @@ -4264,149 +4445,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, } /* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. 
- */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int ret = 0; - int slot; - int i; - u32 nritems; - u32 total_data = 0; - u32 total_size = 0; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct btrfs_key found_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - for (i = 0; i < nr; i++) { - if (total_size + data_size[i] + sizeof(struct btrfs_item) > - BTRFS_LEAF_DATA_SIZE(root)) { - break; - nr = i; - } - total_data += data_size[i]; - total_size += data_size[i] + sizeof(struct btrfs_item); - } - BUG_ON(nr == 0); - - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - for (i = nr; i >= 0; i--) { - total_data -= data_size[i]; - total_size -= data_size[i] + sizeof(struct btrfs_item); - if (total_size < btrfs_leaf_free_space(root, leaf)) - break; - } - nr = i; - } - - slot = path->slots[0]; - BUG_ON(slot < 0); - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* figure out how many keys we can insert in here */ - total_data = data_size[0]; - for (i = 1; i < nr; i++) { - if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) - break; - total_data += data_size[i]; - } - nr = i; - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. 
data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); - } - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } else { - /* - * this sucks but it has to be done, if we are inserting at - * the end of the leaf only insert 1 of the items, since we - * have no way of knowing whats on the next leaf and we'd have - * to drop our current locks to figure it out - */ - nr = 1; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); - } - btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -out: - if (!ret) - ret = nr; - return ret; -} - -/* * this is a helper for btrfs_insert_empty_items, the main goal here is * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot @@ -4567,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. 
*/ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot, - int tree_mod_log) + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4576,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (tree_mod_log && level) + if (level) tree_mod_log_eb_move(root->fs_info, parent, slot, slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, @@ -4584,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); - } else if (tree_mod_log && level) { + } else if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); @@ -4621,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1], 1); + del_ptr(trans, root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -4931,6 +4968,434 @@ out: return ret; } +static void tree_move_down(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + BUG_ON(*level == 0); + path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], + path->slots[*level]); + path->slots[*level - 1] = 0; + (*level)--; +} + +static int tree_move_next_or_upnext(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. 
+ */ +static int tree_advance(struct btrfs_root *root, + struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(root, path, level, root_level); + } else { + tree_move_down(root, path, level, root_level); + ret = 0; + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_root *left_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + int ret; + int cmp; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_start_ctransid; + u64 right_start_ctransid; + u64 ctransid; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + spin_lock(&left_root->root_item_lock); + left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); + spin_unlock(&left_root->root_item_lock); + + spin_lock(&right_root->root_item_lock); + right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); + spin_unlock(&right_root->root_item_lock); + + trans = btrfs_join_transaction(left_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + /* + * Strategy: Go to the first items of both trees. 
Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. + */ + + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = left_root->commit_root; + extent_buffer_get(left_path->nodes[left_level]); + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = right_root->commit_root; + extent_buffer_get(right_path->nodes[right_level]); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + /* + * We need to make sure the transaction does not get committed + * while we do anything on commit roots. This means, we need to + * join and leave transactions for every item that we process. + */ + if (trans && btrfs_should_end_transaction(trans, left_root)) { + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + ret = btrfs_end_transaction(trans, left_root); + trans = NULL; + if (ret < 0) + goto out; + } + /* now rejoin the transaction */ + if (!trans) { + trans = btrfs_join_transaction(left_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + + spin_lock(&left_root->root_item_lock); + ctransid = btrfs_root_ctransid(&left_root->root_item); + spin_unlock(&left_root->root_item_lock); + if (ctransid != left_start_ctransid) + left_start_ctransid = 0; + + spin_lock(&right_root->root_item_lock); + ctransid = btrfs_root_ctransid(&right_root->root_item); + spin_unlock(&right_root->root_item_lock); + if (ctransid != right_start_ctransid) + right_start_ctransid = 0; + + if (!left_start_ctransid || !right_start_ctransid) { + WARN(1, KERN_WARNING + "btrfs: btrfs_compare_tree detected " + "a change in one of the trees while " + "iterating. 
This is probably a " + "bug.\n"); + ret = -EIO; + goto out; + } + + /* + * the commit root may have changed, so start again + * where we stopped + */ + left_path->lowest_level = left_level; + right_path->lowest_level = right_level; + ret = btrfs_search_slot(NULL, left_root, + &left_key, left_path, 0, 0); + if (ret < 0) + goto out; + ret = btrfs_search_slot(NULL, right_root, + &right_key, right_path, 0, 0); + if (ret < 0) + goto out; + } + + if (advance_left && !left_end_reached) { + ret = tree_advance(left_root, left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret < 0) + left_end_reached = ADVANCE; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_root, right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret < 0) + right_end_reached = ADVANCE; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_root, right_root, + left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_root, left_path, + right_path, tmp_buf); + if (ret) { + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_CHANGED, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kfree(tmp_buf); + + if (trans) { + if (!ret) + ret = btrfs_end_transaction(trans, left_root); + else + btrfs_end_transaction(trans, left_root); + } + + return ret; +} + /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. 
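btrfs_compare_trees() above reports every difference through a btrfs_changed_cb_t callback (the typedef is added to ctree.h later in this patch). A hedged sketch of such a callback and its wiring; diff_stats and count_changes are invented names:

struct diff_stats {
	u64 new_items;
	u64 deleted_items;
	u64 changed_items;
};

static int count_changes(struct btrfs_root *left_root,
			 struct btrfs_root *right_root,
			 struct btrfs_path *left_path,
			 struct btrfs_path *right_path,
			 struct btrfs_key *key,
			 enum btrfs_compare_tree_result result,
			 void *ctx)
{
	struct diff_stats *stats = ctx;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		stats->new_items++;
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		stats->deleted_items++;
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		stats->changed_items++;
		break;
	}
	return 0;	/* a negative return aborts the compare */
}

A caller would then do something like:

	struct diff_stats stats = {0};
	ret = btrfs_compare_trees(snap_root, parent_root, count_changes, &stats);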
It looks for and returns the next key in the @@ -5033,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } +/* Release the path up to but not including the given level */ +static void btrfs_release_level(struct btrfs_path *path, int level) +{ + int i; + + for (i = 0; i < level; i++) { + path->slots[i] = 0; + if (!path->nodes[i]) + continue; + if (path->locks[i]) { + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } +} + +/* + * This function assumes 2 things + * + * 1) You are using path->keep_locks + * 2) You are not inserting items. + * + * If either of these are not true do not use this function. If you need a next + * leaf with either of these not being true then this function can be easily + * adapted to do that, but at the moment these are the limitations. + */ +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del) +{ + struct extent_buffer *b; + struct btrfs_key key; + u32 nritems; + int level = 1; + int slot; + int ret = 1; + int write_lock_level = BTRFS_MAX_LEVEL; + int ins_len = del ? -1 : 0; + + WARN_ON(!(path->keep_locks || path->really_keep_locks)); + + nritems = btrfs_header_nritems(path->nodes[0]); + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + while (path->nodes[level]) { + nritems = btrfs_header_nritems(path->nodes[level]); + if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { +search: + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, + ins_len, 1); + if (ret < 0) + goto out; + level = 1; + continue; + } + + if (path->slots[level] >= nritems - 1) { + level++; + continue; + } + + btrfs_release_level(path, level); + break; + } + + if (!path->nodes[level]) { + ret = 1; + goto out; + } + + path->slots[level]++; + b = path->nodes[level]; + + while (b) { + level = btrfs_header_level(b); + + if (!should_cow_block(trans, root, b)) + goto cow_done; + + btrfs_set_path_blocking(path); + ret = btrfs_cow_block(trans, root, b, + path->nodes[level + 1], + path->slots[level + 1], &b); + if (ret) + goto out; +cow_done: + path->nodes[level] = b; + btrfs_clear_path_blocking(path, NULL, 0); + if (level != 0) { + ret = setup_nodes_for_search(trans, root, path, b, + level, ins_len, + &write_lock_level); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + + b = path->nodes[level]; + slot = path->slots[level]; + + ret = read_block_for_search(trans, root, path, + &b, level, slot, &key, 0); + if (ret == -EAGAIN) + goto search; + if (ret) + goto out; + level = btrfs_header_level(b); + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(path, b, + BTRFS_WRITE_LOCK); + } + path->locks[level] = BTRFS_WRITE_LOCK; + path->nodes[level] = b; + path->slots[level] = 0; + } else { + path->slots[level] = 0; + ret = 0; + break; + } + } + +out: + if (ret) + btrfs_release_path(path); + + return ret; +} + int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { @@ -5127,6 +5725,7 @@ again: * locked. To solve this situation, we give up * on our lock and cycle. 
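btrfs_next_leaf_write() above moves the path to the next leaf while re-acquiring write locks, but only if path->keep_locks is set and no items are being inserted. A minimal caller sketch under those two assumptions (for_each_leaf_locked and the leaf-processing hook are invented for illustration):

static int for_each_leaf_locked(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;

	ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
	while (ret >= 0) {
		/* path->nodes[0] is a write-locked leaf here */

		ret = btrfs_next_leaf_write(trans, root, path, 0 /* del */);
		if (ret > 0) {
			/* walked off the end of the tree */
			ret = 0;
			break;
		}
	}
	btrfs_free_path(path);
	return ret;
}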
*/ + free_extent_buffer(next); btrfs_release_path(path); cond_resched(); goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fa5c45b3907..547b7b05727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,7 @@ struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 +#define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 @@ -91,6 +91,9 @@ struct btrfs_ordered_sum; /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -113,7 +116,7 @@ struct btrfs_ordered_sum; #define BTRFS_FREE_SPACE_OBJECTID -11ULL /* - * The inode number assigned to the special inode for sotring + * The inode number assigned to the special inode for storing * free ino cache */ #define BTRFS_FREE_INO_OBJECTID -12ULL @@ -139,6 +142,8 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0 + /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. @@ -151,6 +156,13 @@ struct btrfs_ordered_sum; */ #define BTRFS_NAME_LEN 255 +/* + * Theoretical limit is larger, but we keep this down to a sane + * value. That should limit greatly the possibility of collisions on + * inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* 32 bytes in various csum fields */ #define BTRFS_CSUM_SIZE 32 @@ -162,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 +/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS (1 << 30) + #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 #define BTRFS_FT_DIR 2 @@ -403,7 +418,7 @@ struct btrfs_root_backup { __le64 bytes_used; __le64 num_devices; /* future */ - __le64 unsed_64[4]; + __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; @@ -486,6 +501,8 @@ struct btrfs_super_block { */ #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) + #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ @@ -493,7 +510,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* * A leaf is full of items. offset and size tell us where to find @@ -558,6 +576,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int really_keep_locks:1; }; /* @@ -640,6 +659,14 @@ struct btrfs_inode_ref { /* name goes here */ } __attribute__ ((__packed__)); +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + struct btrfs_timespec { __le64 sec; __le32 nsec; @@ -709,6 +736,36 @@ struct btrfs_root_item { struct btrfs_disk_key drop_progress; u8 drop_level; u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. 
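struct btrfs_inode_extref above packs one or more (parent objectid, index, name) records back to back inside a single extended-ref item; the btrfs_inode_extref_* accessors for its fields are generated further down in this header. An illustrative dumper for one such item (dump_inode_extrefs is an invented debug helper):

static void dump_inode_extrefs(struct extent_buffer *leaf, int slot)
{
	struct btrfs_inode_extref *extref;
	unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
	unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);

	while (ptr < end) {
		u32 name_len;

		extref = (struct btrfs_inode_extref *)ptr;
		name_len = btrfs_inode_extref_name_len(leaf, extref);
		printk(KERN_INFO "extref: parent %llu index %llu name_len %u\n",
		       (unsigned long long)btrfs_inode_extref_parent(leaf, extref),
		       (unsigned long long)btrfs_inode_extref_index(leaf, extref),
		       name_len);

		/* the variable-length name follows the fixed header */
		ptr += sizeof(*extref) + name_len;
	}
}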
Everytime the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + u8 uuid[BTRFS_UUID_SIZE]; + u8 parent_uuid[BTRFS_UUID_SIZE]; + u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); /* @@ -834,6 +891,59 @@ struct btrfs_dev_stats_item { __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; } __attribute__ ((__packed__)); +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace { + u64 replace_state; /* see #define above */ + u64 time_started; /* seconds since 1-Jan-1970 */ + u64 time_stopped; /* seconds since 1-Jan-1970 */ + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + u64 cont_reading_from_srcdev_mode; /* see #define above */ + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + pid_t lock_owner; + atomic_t nesting_level; + struct mutex lock_finishing_cancel_unmount; + struct mutex lock_management_lock; + struct mutex lock; + + struct btrfs_scrub_progress scrub_progress; +}; + +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -883,6 +993,72 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * SCANNING is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. 
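The generation_v2 handshake described above boils down to: if generation_v2 does not match the item's generation, the new fields were last written by a kernel that did not know about them and must be invalidated from generation_v2 onwards. A sketch of that check (the real logic lives in btrfs_read_root_item()/btrfs_update_root(); this helper name is invented):

static void invalidate_stale_root_item_v2(struct btrfs_root_item *item)
{
	if (btrfs_root_generation_v2(item) == btrfs_root_generation(item))
		return;	/* the v2 fields are valid */

	/* wipe everything from generation_v2 to the end of the item */
	memset(&item->generation_v2, 0,
	       sizeof(*item) - offsetof(struct btrfs_root_item, generation_v2));
}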
+ */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 scan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +/* flags definition for qgroup limits */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + struct btrfs_space_info { u64 flags; @@ -929,12 +1105,22 @@ struct btrfs_space_info { wait_queue_head_t wait; }; +#define BTRFS_BLOCK_RSV_GLOBAL 1 +#define BTRFS_BLOCK_RSV_DELALLOC 2 +#define BTRFS_BLOCK_RSV_TRANS 3 +#define BTRFS_BLOCK_RSV_CHUNK 4 +#define BTRFS_BLOCK_RSV_DELOPS 5 +#define BTRFS_BLOCK_RSV_EMPTY 6 +#define BTRFS_BLOCK_RSV_TEMP 7 + struct btrfs_block_rsv { u64 size; u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full; + unsigned short full; + unsigned short type; + unsigned short failfast; }; /* @@ -1028,8 +1214,18 @@ struct btrfs_block_group_cache { * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; + + /* For delayed block group creation */ + struct list_head new_bg_list; }; +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; +}; + +/* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; @@ -1044,6 +1240,7 @@ struct btrfs_fs_info { struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; + struct btrfs_root *quota_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1133,7 +1330,6 @@ struct btrfs_fs_info { struct mutex reloc_mutex; struct list_head trans_list; - struct list_head hashers; struct list_head dead_roots; struct list_head caching_block_groups; @@ -1144,6 +1340,7 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic_t tree_mod_seq; struct list_head tree_mod_seq_list; + struct seq_list tree_mod_seq_elem; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -1195,6 +1392,7 @@ struct btrfs_fs_info { struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; + struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; @@ -1240,6 +1438,8 @@ struct btrfs_fs_info { */ struct list_head space_info; + struct btrfs_space_info *data_sinfo; + struct reloc_control *reloc_ctl; spinlock_t delalloc_lock; @@ -1256,9 +1456,6 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t 
defrag_running; - spinlock_t ref_cache_lock; - u64 total_ref_cache_size; - /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1292,10 +1489,35 @@ struct btrfs_fs_info { struct rw_semaphore scrub_super_lock; int scrub_workers_refcnt; struct btrfs_workers scrub_workers; + struct btrfs_workers scrub_wr_completion_workers; + struct btrfs_workers scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif + /* + * quota information + */ + unsigned int quota_enabled:1; + + /* + * quota_enabled only changes state after a commit. This holds the + * next state. + */ + unsigned int pending_quota_state:1; + + /* is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* holds configuration and tracking. Protected by qgroup_lock */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* list of dirty qgroups to be written at next commit */ + struct list_head dirty_qgroups; + + /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + u64 qgroup_seq; /* filesystem state */ u64 fs_state; @@ -1308,6 +1530,13 @@ struct btrfs_fs_info { /* next backup root to be overwritten */ int backup_root_index; + + int num_tolerated_disk_barrier_failures; + + /* device replace state */ + struct btrfs_dev_replace dev_replace; + + atomic_t mutually_exclusive_operation_running; }; /* @@ -1348,9 +1577,9 @@ struct btrfs_root { wait_queue_head_t log_commit_wait[2]; atomic_t log_writers; atomic_t log_commit[2]; + atomic_t log_batch; unsigned long log_transid; unsigned long last_log_commit; - unsigned long log_batch; pid_t log_start_pid; bool log_multiple_pids; @@ -1416,6 +1645,8 @@ struct btrfs_root { dev_t anon_dev; int force_cow; + + spinlock_t root_item_lock; }; struct btrfs_ioctl_defrag_range_args { @@ -1457,6 +1688,7 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_INODE_ITEM_KEY 1 #define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 #define BTRFS_XATTR_ITEM_KEY 24 #define BTRFS_ORPHAN_ITEM_KEY 48 /* reserve 2-15 close to the inode for later flexibility */ @@ -1525,6 +1757,30 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + #define BTRFS_BALANCE_ITEM_KEY 248 /* @@ -1534,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_STATS_KEY 249 /* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* * string items are for debugging. 
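Per the comments above, all qgroup items live in the quota tree with objectid 0 and, except for the relation keys, the qgroupid in the key offset. An illustrative lookup of one qgroup's usage counters (read_qgroup_info is an invented helper; btrfs_item_ptr() and the qgroup accessors are declared elsewhere in this header):

static int read_qgroup_info(struct btrfs_root *quota_root, u64 qgroupid,
			    u64 *referenced, u64 *exclusive)
{
	struct btrfs_qgroup_info_item *info;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, quota_root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto out;

	leaf = path->nodes[0];
	info = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_qgroup_info_item);
	*referenced = btrfs_qgroup_info_rfer(leaf, info);
	*exclusive = btrfs_qgroup_info_excl(leaf, info);
out:
	btrfs_free_path(path);
	return ret;
}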
They just store a short string of * data in the FS */ @@ -1598,7 +1860,7 @@ struct btrfs_map_token { static inline void btrfs_init_map_token (struct btrfs_map_token *token) { - memset(token, 0, sizeof(*token)); + token->kaddr = NULL; } /* some macros to generate set/get funcs for the struct fields. This @@ -1621,13 +1883,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) offsetof(type, member), \ sizeof(((type *)0)->member))) -#ifndef BTRFS_SETGET_FUNCS +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val, \ + struct btrfs_map_token *token); \ +static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off) \ +{ \ + return btrfs_get_token_##bits(eb, ptr, off, NULL); \ +} \ +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ +} + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); -#endif +static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ +} \ +static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ + type *s, u##bits val, \ + struct btrfs_map_token *token) \ +{ \ + BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ +} #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ @@ -1778,6 +2081,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags, BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); @@ -2189,6 +2499,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 
64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, + ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, + otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, + stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, + rtransid, 64); static inline bool btrfs_root_readonly(struct btrfs_root *root) { @@ -2465,6 +2785,92 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, + scan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); 
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; @@ -2605,10 +3011,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + /* + * Flushing delalloc may cause deadlock somewhere, in this + * case, use FLUSH LIMIT + */ + BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_ALL, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2622,24 +3041,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int 
btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); @@ -2661,6 +3075,9 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2680,6 +3097,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, struct btrfs_path *path, int cache_only, u64 min_trans); +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, + struct btrfs_root *right_root, + struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); +int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t cb, void *ctx); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -2711,6 +3143,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -2753,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -2793,11 +3231,23 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info->quota_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kfree(fs_info); } +/* tree mod log functions from ctree.c */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic_inc_return(&fs_info->tree_mod_seq); +} +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -2819,6 +3269,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); +void 
btrfs_read_root_item(struct btrfs_root *root, + struct extent_buffer *eb, int slot, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); @@ -2826,8 +3279,12 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, + struct btrfs_root *root); /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct inode *dir, @@ -2884,12 +3341,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod); +int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod, + u64 *ret_index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -2897,13 +3354,26 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); +struct btrfs_inode_extref * +btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int ins_len, + int cow); + +int btrfs_find_name_in_ext_backref(struct btrfs_path *path, + u64 ref_objectid, const char *name, + int name_len, + struct btrfs_inode_extref **extref_ret); + /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset, u32 *dst); + struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2914,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); +u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); @@ -2929,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ +struct btrfs_delalloc_work { + struct inode *inode; + int wait; + int delay_iput; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, + int wait, int delay_iput); 
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); + struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); @@ -2961,6 +3445,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len); +int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, @@ -2995,6 +3481,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); @@ -3020,16 +3508,30 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); +void btrfs_get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space); + /* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); +void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, + u64 start, u64 end, int skip_pinned, + int modified); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache); +int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, u64 start, u64 end, + u64 *drop_end, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, u64 start, + u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -3053,14 +3555,48 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); + +#ifdef CONFIG_PRINTK +__printf(2, 3) void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); +#else +static inline __printf(2, 3) +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) 
+{ +} +#endif + +__printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); + void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + } +} + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ + #define btrfs_abort_transaction(trans, root, errno) \ do { \ __btrfs_abort_transaction(trans, root, __func__, \ @@ -3080,6 +3616,7 @@ do { \ (errno), fmt, ##args); \ } while (0) +__printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -3127,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_root *root); void btrfs_scrub_pause_super(struct btrfs_root *root); void btrfs_scrub_continue(struct btrfs_root *root); void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, + struct btrfs_device *dev); int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); @@ -3156,17 +3694,49 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* delayed seq elem */ -struct seq_list { +/* qgroup.c */ +struct qgroup_update { struct list_head list; - u64 seq; - u32 flags; + struct btrfs_delayed_ref_node *node; + struct btrfs_delayed_extent_op *extent_op; }; -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int 
btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2399f408691..34836036f01 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("delayed_node", + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, @@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); delayed_node->bytes_reserved = 0; + memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( @@ -511,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && + if (atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); } @@ -649,8 +650,9 @@ static int btrfs_delayed_inode_reserve_metadata( * we're accounted for. */ if (!src_rsv || (!trans->bytes_reserved && - src_rsv != &root->fs_info->delalloc_block_rsv)) { - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); /* * Since we're under a transaction reserve_metadata_bytes could * try to commit the transaction which will make it return @@ -667,7 +669,7 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { spin_lock(&BTRFS_I(inode)->lock); if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) { @@ -685,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata( * reserve something strictly for us. If not be a pain and try * to steal from the delalloc block rsv. 
*/ - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, + BTRFS_RESERVE_NO_FLUSH); if (!ret) goto out; @@ -1027,9 +1030,10 @@ do_again: btrfs_release_delayed_item(prev); ret = 0; btrfs_release_path(path); - if (curr) + if (curr) { + mutex_unlock(&node->mutex); goto do_again; - else + } else goto delete_fail; } @@ -1054,8 +1058,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < + if (atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && waitqueue_active(&delayed_root->wait)) wake_up(&delayed_root->wait); @@ -1113,8 +1116,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * Returns < 0 on error and returns with an aborted transaction with any * outstanding delayed items cleaned up. */ -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) { struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; @@ -1122,6 +1125,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_block_rsv *block_rsv; int ret = 0; + bool count = (nr > 0); if (trans->aborted) return -EIO; @@ -1137,7 +1141,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node) { + while (curr_node && (!count || (count && nr--))) { curr_root = curr_node->root; ret = btrfs_insert_delayed_items(trans, path, curr_root, curr_node); @@ -1149,6 +1153,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path, curr_node); if (ret) { btrfs_release_delayed_node(curr_node); + curr_node = NULL; btrfs_abort_transaction(trans, root, ret); break; } @@ -1158,12 +1163,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, btrfs_release_delayed_node(prev_node); } + if (curr_node) + btrfs_release_delayed_node(curr_node); btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; } +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_run_delayed_items(trans, root, -1); +} + +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr) +{ + return __btrfs_run_delayed_items(trans, root, nr); +} + static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { @@ -1238,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1299,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) delayed_node); mutex_unlock(&delayed_node->mutex); - nr = trans->blocks_used; - trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); - __btrfs_btree_balance_dirty(root, nr); + btrfs_btree_balance_dirty_nodelay(root); free_path: btrfs_free_path(path); out: @@ -1698,8 +1714,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { - 
btrfs_set_stack_inode_uid(inode_item, inode->i_uid); - btrfs_set_stack_inode_gid(inode_item, inode->i_gid); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); btrfs_set_stack_inode_mode(inode_item, inode->i_mode); btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); @@ -1747,8 +1763,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - inode->i_uid = btrfs_stack_inode_uid(inode_item); - inode->i_gid = btrfs_stack_inode_gid(inode_item); + i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f5aa4023d3e..4f808e1baee 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode); int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int nr); void btrfs_balance_delayed_items(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 13ae7b04790..ae941177339 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -38,17 +38,14 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, struct btrfs_delayed_tree_ref *ref1) { - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; return 0; } @@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, * type of the delayed backrefs and content of delayed backrefs. 
*/ static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1) + struct btrfs_delayed_ref_node *ref1, + bool compare_seq) { if (ref1->bytenr < ref2->bytenr) return -1; @@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, if (ref1->type > ref2->type) return 1; /* merging of sequenced refs is not allowed */ - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; + if (compare_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, rb_node); - cmp = comp_entry(entry, ins); + cmp = comp_entry(entry, ins, 1); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) @@ -233,22 +233,134 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) +static void inline drop_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *ref) { - struct seq_list *elem; + rb_erase(&ref->rb_node, &delayed_refs->root); + ref->in_tree = 0; + btrfs_put_delayed_ref(ref); + delayed_refs->num_entries--; + if (trans->delayed_ref_updates) + trans->delayed_ref_updates--; +} - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; +static int merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *ref, u64 seq) +{ + struct rb_node *node; + int merged = 0; + int mod = 0; + int done = 0; + + node = rb_prev(&ref->rb_node); + while (node) { + struct btrfs_delayed_ref_node *next; + + next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_prev(node); + if (next->bytenr != ref->bytenr) + break; + if (seq && next->seq >= seq) + break; + if (comp_entry(ref, next, 0)) + continue; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + struct btrfs_delayed_ref_node *tmp; - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; + tmp = ref; + ref = next; + next = tmp; + done = 1; + } + mod = -next->ref_mod; + } + + merged++; + drop_delayed_ref(trans, delayed_refs, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, ref); + break; + } else { + /* + * You can't have multiples of the same ref on a tree + * block. 
+ */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (done) + break; + node = rb_prev(&ref->rb_node); } - return 0; + + return merged; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct rb_node *node; + u64 seq = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + node = rb_prev(&head->node.rb_node); + while (node) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, + rb_node); + if (ref->bytenr != head->node.bytenr) + break; + + /* We can't merge refs that are outside of our seq count */ + if (seq && ref->seq >= seq) + break; + if (merge_ref(trans, delayed_refs, ref, seq)) + node = rb_prev(&head->node.rb_node); + else + node = rb_prev(node); + } +} + +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + int ret = 0; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is " + "%llu (%p)\n", seq, elem->seq, delayed_refs); + ret = 1; + } + } + + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, @@ -332,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans, * every changing the extent allocation tree. 
*/ existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } else { + if (existing->ref_mod == 0) + drop_delayed_ref(trans, delayed_refs, existing); + else WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - } } else { WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || existing->type == BTRFS_SHARED_BLOCK_REF_KEY); @@ -525,8 +630,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -584,8 +689,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -658,10 +763,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -707,10 +811,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -736,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 413927fb995..c9d703693df 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,7 +18,7 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ -/* these are the possible values of struct btrfs_delayed_ref->action */ +/* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ @@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. 
While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. - */ - wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -187,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); @@ -195,34 +179,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) { - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} + if (for_cow) + return 0; -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} /* * a node might live in a head or a regular ref, this lets you diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 00000000000..66dbc8dbddf --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_path *path = NULL; + int item_size; + struct btrfs_dev_replace_item *ptr; + u64 src_devid; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { +no_valid_dev_replace_entry_found: + ret = 0; + dev_replace->replace_state = + BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + dev_replace->cont_reading_from_srcdev_mode = + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; + dev_replace->replace_state = 0; + dev_replace->time_started = 0; + dev_replace->time_stopped = 0; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + dev_replace->is_valid = 0; + dev_replace->item_needs_writeback = 0; + goto out; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + + if (item_size != sizeof(struct btrfs_dev_replace_item)) { + pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); + goto no_valid_dev_replace_entry_found; + } + + src_devid = btrfs_dev_replace_src_devid(eb, ptr); + dev_replace->cont_reading_from_srcdev_mode = + btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); + dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); + dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); + dev_replace->time_stopped = + btrfs_dev_replace_time_stopped(eb, ptr); + atomic64_set(&dev_replace->num_write_errors, + btrfs_dev_replace_num_write_errors(eb, ptr)); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, + btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); + dev_replace->cursor_left = 
btrfs_dev_replace_cursor_left(eb, ptr); + dev_replace->committed_cursor_left = dev_replace->cursor_left; + dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; + dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); + dev_replace->is_valid = 1; + + dev_replace->item_needs_writeback = 0; + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, + NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info, + BTRFS_DEV_REPLACE_DEVID, + NULL, NULL); + /* + * allow 'btrfs dev replace_cancel' if src/tgt device is + * missing + */ + if (!dev_replace->srcdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", + (unsigned long long)src_devid); + } + if (!dev_replace->tgtdev && + !btrfs_test_opt(dev_root, DEGRADED)) { + ret = -EIO; + pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", + (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + } + if (dev_replace->tgtdev) { + if (dev_replace->srcdev) { + dev_replace->tgtdev->total_bytes = + dev_replace->srcdev->total_bytes; + dev_replace->tgtdev->disk_total_bytes = + dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->bytes_used = + dev_replace->srcdev->bytes_used; + } + dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; + btrfs_init_dev_replace_tgtdev_for_resume(fs_info, + dev_replace->tgtdev); + } + break; + } + +out: + if (path) + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_replace_item *ptr; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + if (!dev_replace->is_valid || + !dev_replace->item_needs_writeback) { + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + key.objectid = 0; + key.type = BTRFS_DEV_REPLACE_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for dev_replace item!\n", + ret); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* + * need to delete old one and insert a new one. + * Since no attempt is made to recover any old state, if the + * dev_replace state is 'running', the data on the target + * drive is lost. + * It would be possible to recover the state: just make sure + * that the beginning of the item is never changed and always + * contains all the essential information. Then read this + * minimal set of information and use it as a base for the + * new state. 
+ */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + pr_warn("btrfs: delete too small dev_replace item failed %d!\n", + ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + pr_warn("btrfs: insert dev_replace item failed %d!\n", + ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_dev_replace_item); + + btrfs_dev_replace_lock(dev_replace); + if (dev_replace->srcdev) + btrfs_set_dev_replace_src_devid(eb, ptr, + dev_replace->srcdev->devid); + else + btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); + btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, + dev_replace->cont_reading_from_srcdev_mode); + btrfs_set_dev_replace_replace_state(eb, ptr, + dev_replace->replace_state); + btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); + btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); + btrfs_set_dev_replace_num_write_errors(eb, ptr, + atomic64_read(&dev_replace->num_write_errors)); + btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, + atomic64_read(&dev_replace->num_uncorrectable_read_errors)); + dev_replace->cursor_left_last_write_of_item = + dev_replace->cursor_left; + btrfs_set_dev_replace_cursor_left(eb, ptr, + dev_replace->cursor_left_last_write_of_item); + btrfs_set_dev_replace_cursor_right(eb, ptr, + dev_replace->cursor_right); + dev_replace->item_needs_writeback = 0; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + + return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + dev_replace->committed_cursor_left = + dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ + struct timespec t = CURRENT_TIME_SEC; + + return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + &tgt_device); + if (ret) { + pr_err("btrfs: target device %s is invalid!\n", + args->start.tgtdev_name); + mutex_unlock(&fs_info->volume_mutex); + return -EINVAL; + } + + ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + args->start.srcdev_name, + &src_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) { + ret = -EINVAL; + goto leave_no_lock; + } + + if (tgt_device->total_bytes < src_device->total_bytes) { + pr_err("btrfs: target device is smaller than source device!\n"); + ret = -EINVAL; + goto leave_no_lock; + } + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case 
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } + + dev_replace->cont_reading_from_srcdev_mode = + args->start.cont_reading_from_srcdev_mode; + WARN_ON(!src_device); + dev_replace->srcdev = src_device; + WARN_ON(!tgt_device); + dev_replace->tgtdev = tgt_device; + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + + /* + * from now on, the writes to the srcdev are all duplicated to + * go to the tgtdev as well (refer to btrfs_map_block()). + */ + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + dev_replace->time_started = btrfs_get_seconds_since_1970(); + dev_replace->cursor_left = 0; + dev_replace->committed_cursor_left = 0; + dev_replace->cursor_left_last_write_of_item = 0; + dev_replace->cursor_right = 0; + dev_replace->is_valid = 1; + dev_replace->item_needs_writeback = 1; + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + btrfs_dev_replace_unlock(dev_replace); + + btrfs_wait_ordered_extents(root, 0); + + /* force writing the updated state information to disk */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_lock(dev_replace); + goto leave; + } + + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* the disk copy procedure reuses the scrub code */ + ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, + src_device->total_bytes, + &dev_replace->scrub_progress, 0, 1); + + ret = btrfs_dev_replace_finishing(root->fs_info, ret); + WARN_ON(ret); + + return 0; + +leave: + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, + int scrub_ret) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device; + struct btrfs_device *src_device; + struct btrfs_root *root = fs_info->tree_root; + u8 uuid_tmp[BTRFS_UUID_SIZE]; + struct btrfs_trans_handle *trans; + int ret = 0; + + /* don't allow cancel or unmount to disturb the finishing procedure */ + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + + btrfs_dev_replace_lock(dev_replace); + /* was the operation canceled, or is it finished? 
*/ + if (dev_replace->replace_state != + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return 0; + } + + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_unlock(dev_replace); + + /* replace old device with new one in mapping tree */ + if (!scrub_ret) + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + + /* + * flush all outstanding I/O and inode extent mappings before the + * copy operation is declared as being finished + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + + /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + btrfs_dev_replace_lock(dev_replace); + dev_replace->replace_state = + scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED + : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + + if (scrub_ret) { + printk_in_rcu(KERN_ERR + "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; + } + + printk_in_rcu(KERN_INFO + "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); + tgt_device->is_tgtdev_for_dev_replace = 0; + tgt_device->devid = src_device->devid; + src_device->devid = BTRFS_DEV_REPLACE_DEVID; + tgt_device->bytes_used = src_device->bytes_used; + memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); + memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); + memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); + tgt_device->total_bytes = src_device->total_bytes; + tgt_device->disk_total_bytes = src_device->disk_total_bytes; + tgt_device->bytes_used = src_device->bytes_used; + if (fs_info->sb->s_bdev == src_device->bdev) + fs_info->sb->s_bdev = tgt_device->bdev; + if (fs_info->fs_devices->latest_bdev == src_device->bdev) + fs_info->fs_devices->latest_bdev = tgt_device->bdev; + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + if (src_device->bdev) { + /* zero out the old super */ + btrfs_scratch_superblock(src_device); + } + /* + * this is again a consistent state where no dev_replace procedure + * is running, the target device is part of the filesystem, the + * source device is not part of the filesystem anymore and its 1st + * superblock is scratched out so that it is no longer marked to + * belong to this filesystem. 
+ */ + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + + /* write back the superblocks */ + trans = btrfs_start_transaction(root, 0); + if (!IS_ERR(trans)) + btrfs_commit_transaction(trans, root); + + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + + return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + /* even if !dev_replace_is_valid, the values are good enough for + * the replace_status ioctl */ + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + args->status.replace_state = dev_replace->replace_state; + args->status.time_started = dev_replace->time_started; + args->status.time_stopped = dev_replace->time_stopped; + args->status.num_write_errors = + atomic64_read(&dev_replace->num_write_errors); + args->status.num_uncorrectable_read_errors = + atomic64_read(&dev_replace->num_uncorrectable_read_errors); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + args->status.progress_1000 = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + args->status.progress_1000 = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, + div64_u64(dev_replace->srcdev->total_bytes, 1000)); + break; + } + btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args) +{ + args->result = __btrfs_dev_replace_cancel(fs_info); + return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *tgt_device = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = fs_info->tree_root; + u64 result; + int ret; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + btrfs_dev_replace_unlock(dev_replace); + goto 
leave; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + dev_replace->tgtdev = NULL; + dev_replace->srcdev = NULL; + break; + } + dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, root); + WARN_ON(ret); + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->time_stopped = btrfs_get_seconds_since_1970(); + dev_replace->item_needs_writeback = 1; + pr_info("btrfs: suspending dev_replace for unmount\n"); + break; + } + + btrfs_dev_replace_unlock(dev_replace); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + + btrfs_dev_replace_lock(dev_replace); + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + btrfs_dev_replace_unlock(dev_replace); + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; + break; + } + if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { + pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" + "btrfs: you may cancel the operation after 'mount -o degraded'\n"); + btrfs_dev_replace_unlock(dev_replace); + return 0; + } + btrfs_dev_replace_unlock(dev_replace); + + WARN_ON(atomic_xchg( + &fs_info->mutually_exclusive_operation_running, 1)); + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); + return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_ioctl_dev_replace_args *status_args; + u64 progress; + + status_args = kzalloc(sizeof(*status_args), GFP_NOFS); + if (status_args) { + btrfs_dev_replace_status(fs_info, status_args); + progress = status_args->status.progress_1000; + kfree(status_args); + do_div(progress, 10); + printk_in_rcu(KERN_INFO + "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + dev_replace->srcdev->missing ? 
"<missing disk>" : + rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? + rcu_str_deref(dev_replace->tgtdev->name) : + "<missing target disk>", + (unsigned int)progress); + } + btrfs_dev_replace_continue_on_mount(fs_info); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + + return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int ret; + + ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, + dev_replace->committed_cursor_left, + dev_replace->srcdev->total_bytes, + &dev_replace->scrub_progress, 0, 1); + ret = btrfs_dev_replace_finishing(fs_info, ret); + WARN_ON(ret); + return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ + if (!dev_replace->is_valid) + return 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + return 0; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * return true even if tgtdev is missing (this is + * something that can happen if the dev_replace + * procedure is suspended by an umount and then + * the tgtdev is missing (or "btrfs dev scan") was + * not called and the the filesystem is remounted + * in degraded state. This does not stop the + * dev_replace procedure. It needs to be canceled + * manually if the cancelation is wanted. + */ + break; + } + return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ + /* the beginning is just an optimization for the typical case */ + if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: + /* this is not a nested case where the same thread + * is trying to acqurire the same lock twice */ + mutex_lock(&dev_replace->lock); + mutex_lock(&dev_replace->lock_management_lock); + dev_replace->lock_owner = current->pid; + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_lock(&dev_replace->lock_management_lock); + if (atomic_read(&dev_replace->nesting_level) > 0 && + dev_replace->lock_owner == current->pid) { + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + atomic_inc(&dev_replace->nesting_level); + mutex_unlock(&dev_replace->lock_management_lock); + return; + } + + mutex_unlock(&dev_replace->lock_management_lock); + goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ + WARN_ON(!mutex_is_locked(&dev_replace->lock)); + mutex_lock(&dev_replace->lock_management_lock); + WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); + WARN_ON(dev_replace->lock_owner != current->pid); + atomic_dec(&dev_replace->nesting_level); + if (atomic_read(&dev_replace->nesting_level) == 0) { + dev_replace->lock_owner = 0; + mutex_unlock(&dev_replace->lock_management_lock); + mutex_unlock(&dev_replace->lock); + } else { + mutex_unlock(&dev_replace->lock_management_lock); + } +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 00000000000..20035cbbf02 --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) STRATO AG 2012. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ + atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d0696..502c2158167 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const char *name, int name_len) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_item *di; + int data_size; + struct extent_buffer *leaf; + int slot; + struct btrfs_path *path; + + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + /* return back any errors */ + if (ret < 0) + goto out; + + /* nothing found, we're safe */ + if (ret > 0) { + ret = 0; + goto out; + } + + /* we found an item, look for our name in the item */ + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) { + /* our exact name was found */ + ret = -EEXIST; + goto out; + } + + /* + * see if there is room in the item to insert this + * name + */ + data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (data_size + btrfs_item_size_nr(leaf, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + ret = -EOVERFLOW; + } else { + /* plenty of insertion room */ + ret = 0; + } +out: + btrfs_free_path(path); + return ret; +} + /* * lookup a directory item based on index. 
'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2936ca49b3b..a8f652dc940 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,11 @@ #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" +#include "dev-replace.h" + +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -217,26 +222,16 @@ static struct extent_map *btree_get_extent(struct inode *inode, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; - } + if (!em) + em = ERR_PTR(-EIO); } else if (ret) { free_extent_map(em); - em = NULL; + em = ERR_PTR(ret); } write_unlock(&em_tree->lock); - if (ret) - em = ERR_PTR(ret); out: return em; } @@ -377,9 +372,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ret = read_extent_buffer_pages(io_tree, eb, start, WAIT_COMPLETE, btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, + if (!ret) { + if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; + break; + else + ret = -EIO; + } /* * This buffer's crc is fine, but its contents are corrupted, so @@ -389,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + num_copies = btrfs_num_copies(root->fs_info, eb->start, eb->len); if (num_copies == 1) break; @@ -407,7 +406,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, break; } - if (failed && !ret) + if (failed && !ret && failed_mirror) repair_eb_io_failure(root, eb, failed_mirror); return ret; @@ -435,10 +434,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); return 0; } - if (eb->pages[0] != page) { - WARN_ON(1); - return 0; - } if (!PageUptodate(page)) { WARN_ON(1); return 0; @@ -754,9 +749,7 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && + if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -860,21 +853,37 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int ret; + /* * when we're called for a write, we're already in the async * submission context. 
Just jump into btrfs_map_bio */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + if (ret) + bio_endio(bio, ret); + return ret; +} + +static int check_async_write(struct inode *inode, unsigned long bio_flags) +{ + if (bio_flags & EXTENT_BIO_TREE_LOG) + return 0; +#ifdef CONFIG_X86 + if (cpu_has_xmm4_2) + return 0; +#endif + return 1; } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + int async = check_async_write(inode, bio_flags); int ret; if (!(rw & REQ_WRITE)) { - /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -882,20 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1); if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else if (!async) { + ret = btree_csum_one_bio(bio); + if (ret) + goto out_w_error; + ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } else { + /* + * kthread helpers are used to submit writes so that + * checksumming can happen in parallel across all CPUs + */ + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + bio_offset, + __btree_submit_bio_start, + __btree_submit_bio_done); } - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - bio_offset, - __btree_submit_bio_start, - __btree_submit_bio_done); + if (ret) { +out_w_error: + bio_endio(bio, ret); + } + return ret; } #ifdef CONFIG_MIGRATION @@ -980,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) static int btree_set_page_dirty(struct page *page) { +#ifdef DEBUG struct extent_buffer *eb; BUG_ON(!PagePrivate(page)); @@ -988,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_locked(eb); +#endif return __set_page_dirty_nobuffers(page); } @@ -1114,16 +1137,16 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, spin_unlock(&root->fs_info->delalloc_lock); btrfs_panic(root->fs_info, -EOVERFLOW, "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%lu)", + " dirty_mdatadata_bytes (%llu)", buf->len, root->fs_info->dirty_metadata_bytes); } spin_unlock(&root->fs_info->delalloc_lock); - } - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); + } } } @@ -1166,8 +1189,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1182,6 +1205,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_running = 0; 
root->root_key.objectid = objectid; root->anon_dev = 0; + + spin_lock_init(&root->root_item_lock); } static int __must_check find_and_setup_root(struct btrfs_root *tree_root, @@ -1225,6 +1250,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) return root; } +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + int ret = 0; + u64 bytenr; + + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + bytenr = leaf->start; + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + root->node = leaf; + + write_extent_buffer(leaf, fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + root->commit_root = btrfs_root_node(root); + root->track_dirty = 1; + + + root->root_item.flags = 0; + root->root_item.byte_limit = 0; + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + root->root_item.drop_level = 0; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + btrfs_tree_unlock(leaf); + +fail: + if (ret) + return ERR_PTR(ret); + + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1326,6 +1427,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u64 generation; u32 blocksize; int ret = 0; + int slot; root = btrfs_alloc_root(fs_info); if (!root) @@ -1352,9 +1454,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); if (ret == 0) { l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); + slot = path->slots[0]; + btrfs_read_root_item(tree_root, l, slot, &root->root_item); memcpy(&root->root_key, location, sizeof(*location)); } btrfs_free_path(path); @@ -1396,6 +1497,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; + if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + return fs_info->quota_root ? 
fs_info->quota_root : + ERR_PTR(-ENOENT); again: spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, @@ -1533,8 +1637,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); @@ -1566,7 +1668,6 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); @@ -1587,9 +1688,10 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { - cannot_commit = true; + if (PTR_ERR(trans) != -ENOENT) + cannot_commit = true; goto sleep; } if (transid == trans->transid) { @@ -1823,6 +1925,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_extent_buffer(info->extent_root->commit_root); free_extent_buffer(info->csum_root->node); free_extent_buffer(info->csum_root->commit_root); + if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); + } info->tree_root->node = NULL; info->tree_root->commit_root = NULL; @@ -1832,6 +1938,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) info->extent_root->commit_root = NULL; info->csum_root->node = NULL; info->csum_root->commit_root = NULL; + if (info->quota_root) { + info->quota_root->node = NULL; + info->quota_root->commit_root = NULL; + } if (chunk_root) { free_extent_buffer(info->chunk_root->node); @@ -1862,6 +1972,7 @@ int open_ctree(struct super_block *sb, struct btrfs_root *csum_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; + struct btrfs_root *quota_root; struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; @@ -1873,9 +1984,10 @@ int open_ctree(struct super_block *sb, csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); + quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { + !chunk_root || !dev_root || !quota_root) { err = -ENOMEM; goto fail; } @@ -1904,13 +2016,11 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); @@ -1924,12 +2034,15 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); - btrfs_init_block_rsv(&fs_info->global_block_rsv); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); - btrfs_init_block_rsv(&fs_info->trans_block_rsv); - 
btrfs_init_block_rsv(&fs_info->chunk_block_rsv); - btrfs_init_block_rsv(&fs_info->empty_block_rsv); - btrfs_init_block_rsv(&fs_info->delayed_block_rsv); + btrfs_init_block_rsv(&fs_info->global_block_rsv, + BTRFS_BLOCK_RSV_GLOBAL); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); + btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv, + BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -2031,6 +2144,18 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + + spin_lock_init(&fs_info->qgroup_lock); + fs_info->qgroup_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2172,6 +2297,10 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", + fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), @@ -2243,8 +2372,9 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); + ret |= btrfs_start_workers(&fs_info->flush_workers); if (ret) { - ret = -ENOMEM; + err = -ENOMEM; goto fail_sb_buffer; } @@ -2311,7 +2441,11 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2356,6 +2490,17 @@ retry_root_backup: goto recovery_tree_root; csum_root->track_dirty = 1; + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_QUOTA_TREE_OBJECTID, quota_root); + if (ret) { + kfree(quota_root); + quota_root = fs_info->quota_root = NULL; + } else { + quota_root->track_dirty = 1; + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + } + fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2372,6 +2517,14 @@ retry_root_backup: goto fail_block_groups; } + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("btrfs: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2383,6 +2536,15 @@ retry_root_backup: printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } + 
fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable mount is not allowed\n"); + goto fail_block_groups; + } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); @@ -2415,17 +2577,19 @@ retry_root_backup: " integrity check module %s\n", sb->s_id); } #endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0 && - !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { printk(KERN_WARNING "Btrfs log replay required " "on RO media\n"); err = -EIO; - goto fail_trans_kthread; + goto fail_qgroup; } blocksize = btrfs_level_size(tree_root, @@ -2434,7 +2598,7 @@ retry_root_backup: log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; - goto fail_trans_kthread; + goto fail_qgroup; } __setup_root(nodesize, leafsize, sectorsize, stripesize, @@ -2466,15 +2630,15 @@ retry_root_backup: if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) { - } + if (ret) + goto fail_trans_kthread; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING "btrfs: failed to recover relocation\n"); err = -EINVAL; - goto fail_trans_kthread; + goto fail_qgroup; } } @@ -2484,10 +2648,10 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) - goto fail_trans_kthread; + goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) @@ -2509,8 +2673,17 @@ retry_root_backup: return ret; } + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + return 0; +fail_qgroup: + btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); fail_cleaner: @@ -2543,6 +2716,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->flush_workers); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2762,12 +2936,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) printk_in_rcu("btrfs: disabling barriers on dev %s\n", rcu_str_deref(device->name)); device->nobarriers = 1; - } - if (!bio_flagged(bio, BIO_UPTODATE)) { + } else if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; - if (!bio_flagged(bio, BIO_EOPNOTSUPP)) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ @@ -2806,14 +2978,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; - int errors = 0; + int errors_send = 0; + int errors_wait = 0; int ret; /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_send++; continue; } if 
(!dev->in_fs_metadata || !dev->writeable) @@ -2821,13 +2994,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 0); if (ret) - errors++; + errors_send++; } /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { - errors++; + errors_wait++; continue; } if (!dev->in_fs_metadata || !dev->writeable) @@ -2835,13 +3008,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ret = write_dev_flush(dev, 1); if (ret) - errors++; + errors_wait++; } - if (errors) + if (errors_send > info->num_tolerated_disk_barrier_failures || + errors_wait > info->num_tolerated_disk_barrier_failures) return -EIO; return 0; } +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ioctl_space_info space; + struct btrfs_space_info *sinfo; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; + int i; + int c; + int num_tolerated_disk_barrier_failures = + (int)fs_info->fs_devices->num_devices; + + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + sinfo = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { + if (tmp->flags == types[i]) { + sinfo = tmp; + break; + } + } + rcu_read_unlock(); + + if (!sinfo) + continue; + + down_read(&sinfo->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&sinfo->block_groups[c])) { + u64 flags; + + btrfs_get_block_group_info( + &sinfo->block_groups[c], &space); + if (space.total_bytes == 0 || + space.used_bytes == 0) + continue; + flags = space.flags; + /* + * return + * 0: if dup, single or RAID0 is configured for + * any of metadata, system or data, else + * 1: if RAID5 is configured, or if RAID1 or + * RAID10 is configured and only two mirrors + * are used, else + * 2: if RAID6 is configured, else + * num_mirrors - 1: if RAID1 or RAID10 is + * configured and more than + * 2 mirrors are used. 
+ */ + if (num_tolerated_disk_barrier_failures > 0 && + ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) + == 0))) + num_tolerated_disk_barrier_failures = 0; + else if (num_tolerated_disk_barrier_failures > 1 + && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) + num_tolerated_disk_barrier_failures = 1; + } + } + up_read(&sinfo->groups_sem); + } + + return num_tolerated_disk_barrier_failures; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2864,8 +3111,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - if (do_barriers) - barrier_all_devices(root->fs_info); + if (do_barriers) { + ret = barrier_all_devices(root->fs_info); + if (ret) { + mutex_unlock( + &root->fs_info->fs_devices->device_list_mutex); + btrfs_error(root->fs_info, ret, + "errors while submitting device barriers."); + return ret; + } + } list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { @@ -3065,41 +3320,27 @@ int close_ctree(struct btrfs_root *root) smp_mb(); /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); + btrfs_pause_balance(fs_info); + + btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(root); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); + btrfs_cleanup_defrag_inodes(fs_info); - /* - * Here come 2 situations when btrfs is broken to flip readonly: - * - * 1. when btrfs flips readonly somewhere else before - * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, - * and btrfs will skip to write sb directly to keep - * ERROR state on disk. - * - * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannot write sb via btrfs_commit_super, - * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, - * btrfs will cleanup all FS resources first and write sb then. 
- */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - ret = btrfs_error_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3109,14 +3350,12 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); + btrfs_free_qgroup_config(root->fs_info); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); - } free_extent_buffer(fs_info->extent_root->node); free_extent_buffer(fs_info->extent_root->commit_root); @@ -3128,6 +3367,10 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->dev_root->commit_root); free_extent_buffer(fs_info->csum_root->node); free_extent_buffer(fs_info->csum_root->commit_root); + if (fs_info->quota_root) { + free_extent_buffer(fs_info->quota_root->node); + free_extent_buffer(fs_info->quota_root->commit_root); + } btrfs_free_block_groups(fs_info); @@ -3148,6 +3391,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3192,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + if (transid != root->fs_info->generation) + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", (unsigned long long)buf->start, (unsigned long long)transid, (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(&root->fs_info->delalloc_lock); @@ -3208,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) } } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, + int flush_delayed) { /* * looks as though older kernels can get into trouble with @@ -3220,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) if (current->flags & PF_MEMALLOC) return; - btrfs_balance_delayed_items(root); + if (flush_delayed) + btrfs_balance_delayed_items(root); num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); + balance_dirty_pages_ratelimited( + root->fs_info->btree_inode->i_mapping); } return; } -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +void btrfs_btree_balance_dirty(struct btrfs_root *root) { - /* - * looks as though older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - num_dirty = root->fs_info->dirty_metadata_bytes; + __btrfs_btree_balance_dirty(root, 1); +} 
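btrfs_calc_num_tolerated_disk_barrier_failures(), added a little earlier, reduces to the rule spelled out in its comment: start from the device count, drop to 0 as soon as any DUP, single or RAID0 group exists, and cap at 1 for RAID1/RAID10, which keep only two copies (the RAID5/RAID6 cases named in the comment have no corresponding check in the code yet). A simplified standalone rendering of that rule, using invented profile constants rather than the real BTRFS_BLOCK_GROUP_* flags:

enum profile {                  /* illustrative stand-ins, not the on-disk flags */
        PROF_SINGLE,
        PROF_DUP,
        PROF_RAID0,
        PROF_RAID1,
        PROF_RAID10,
};

static int tolerated_failures(const enum profile *groups, int n, int num_devices)
{
        int tolerated = num_devices;    /* optimistic upper bound */
        int i;

        for (i = 0; i < n; i++) {
                switch (groups[i]) {
                case PROF_SINGLE:
                case PROF_DUP:
                case PROF_RAID0:
                        tolerated = 0;          /* all copies can sit on one device */
                        break;
                case PROF_RAID1:
                case PROF_RAID10:
                        if (tolerated > 1)
                                tolerated = 1;  /* only two copies of each block */
                        break;
                }
        }
        return tolerated;
}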
- if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ + __btrfs_btree_balance_dirty(root, 0); } int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) @@ -3258,52 +3491,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -static int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)) -{ - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *eb; - - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. - */ - if (!PagePrivate(page)) - goto out; - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) - goto out; - - if (!btrfs_try_tree_write_lock(eb)) { - flush_fn(data); - btrfs_tree_lock(eb); - } - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - - btrfs_tree_unlock(eb); -out: - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - return 0; -} - static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { @@ -3315,18 +3502,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (read_only) return 0; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - printk(KERN_WARNING "warning: mount fs with errors, " - "running btrfsck is recommended\n"); - } - return 0; } -int btrfs_error_commit_super(struct btrfs_root *root) +void btrfs_error_commit_super(struct btrfs_root *root) { - int ret; - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -3336,10 +3516,6 @@ int btrfs_error_commit_super(struct btrfs_root *root) /* cleanup FS via transaction */ btrfs_cleanup_transaction(root); - - ret = write_ctree_super(NULL, root, 0); - - return ret; } static void btrfs_destroy_ordered_operations(struct btrfs_root *root) @@ -3517,7 +3693,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); + mark, NULL); if (ret) break; @@ -3572,7 +3748,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -3663,14 +3839,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) /* FIXME: cleanup wait for commit */ t->in_commit = 1; t->blocked = 1; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); t->blocked = 0; + smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); t->commit_done = 1; + smp_mb(); if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); @@ -3706,7 +3885,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) } static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, 
.readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 05b3fab39f7..305c33efb0e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -int btrfs_error_commit_super(struct btrfs_root *root); +void btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, @@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -89,6 +89,14 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid); +int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)); +int btrfs_calc_num_tolerated_disk_barrier_failures( + struct btrfs_fs_info *fs_info); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e1d36702ff..5a3327b8f90 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,9 @@ #include "volumes.h" #include "locking.h" #include "free-space-cache.h" +#include "math.h" + +#undef SCRAMBLE_DELAYED_REFS /* * control flags for do_chunk_alloc's force field @@ -92,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); + struct btrfs_root *extent_root, u64 flags, + int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -310,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, while (start < end) { ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); if (ret) break; @@ -646,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 
div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { @@ -1832,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2217,6 +2203,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int count = 0; int must_insert_reserved = 0; @@ -2249,13 +2236,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + */ + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); + + /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates */ ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { /* * there are still refs with lower seq numbers in the * process of being added. Don't run this ref yet. @@ -2300,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, kfree(extent_op); if (ret) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; @@ -2315,12 +2315,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. 
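The comment added to run_clustered_refs() above explains why btrfs_merge_delayed_refs() has to run first: relocation can queue a drop of an implicit ref and an add of the same ref back to back, and replaying them unmerged exposes the transient state. A toy sketch of that cancellation idea, collapsing matched add/drop entries for the same extent before anything is executed (the structure is invented for the illustration):

#include <stdint.h>

struct toy_ref {
        uint64_t bytenr;        /* extent the ref applies to */
        int      delta;         /* +1 for an add, -1 for a drop */
        int      dead;          /* set once the entry cancels out */
};

/* Cancel paired adds and drops so no intermediate state is replayed. */
static void merge_refs(struct toy_ref *refs, int n)
{
        for (int i = 0; i < n; i++) {
                if (refs[i].dead)
                        continue;
                for (int j = i + 1; j < n; j++) {
                        if (refs[j].dead || refs[j].bytenr != refs[i].bytenr)
                                continue;
                        if (refs[i].delta + refs[j].delta == 0) {
                                refs[i].dead = refs[j].dead = 1;
                                break;
                        }
                }
        }
}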
- */ + if (locked_ref) { + /* + * when we play the delayed ref, also correct the + * ref_mod on head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->node.ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->node.ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + } spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2331,35 +2342,97 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, count++; if (ret) { + if (locked_ref) { + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + } printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; } next: - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } return count; } -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs, - struct list_head *first_seq) +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(delayed_refs->seq_wait, - num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); - spin_lock(&delayed_refs->lock); + struct qgroup_update *qgroup_update; + int ret = 0; + + if (list_empty(&trans->qgroup_ref_list) != + !trans->delayed_ref_elem.seq) { + /* list without seq or seq without list */ + printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + list_empty(&trans->qgroup_ref_list) ? 
"" : " not", + trans->delayed_ref_elem.seq); + BUG(); + } + + if (!trans->delayed_ref_elem.seq) + return 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + qgroup_update = list_first_entry(&trans->qgroup_ref_list, + struct qgroup_update, list); + list_del(&qgroup_update->list); + if (!ret) + ret = btrfs_qgroup_account_ref( + trans, fs_info, qgroup_update->node, + qgroup_update->extent_op); + kfree(qgroup_update); + } + + btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + + return ret; } /* @@ -2379,13 +2452,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct list_head cluster; - struct list_head *first_seq = NULL; int ret; u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; + int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2394,15 +2465,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: - consider_waiting = 0; + loops = 0; spin_lock(&delayed_refs->lock); + +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; @@ -2424,31 +2498,6 @@ again: if (ret) break; - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - first_seq = root->fs_info->tree_mod_seq_list.next; - } else { - wait_for_more_refs(delayed_refs, - num_refs, first_seq); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. - * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { spin_unlock(&delayed_refs->lock); @@ -2461,13 +2510,36 @@ again: if (count == 0) break; - if (ret || delayed_refs->run_delayed_start == 0) { + if (delayed_start >= delayed_refs->run_delayed_start) { + if (loops == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked), bail out. 
+ */ + loops = 1; + } else { + /* + * no runnable refs left, stop trying + */ + BUG_ON(run_all); + break; + } + } + if (ret) { /* refs were run, let's reset staleness detection */ - consider_waiting = 0; + loops = 0; } } if (run_all) { + if (!list_empty(&trans->new_bgs)) { + spin_unlock(&delayed_refs->lock); + btrfs_create_pending_block_groups(trans, root); + spin_lock(&delayed_refs->lock); + } + node = rb_first(&delayed_refs->root); if (!node) goto out; @@ -2502,6 +2574,7 @@ again: } out: spin_unlock(&delayed_refs->lock); + assert_qgroups_uptodate(trans); return 0; } @@ -2581,8 +2654,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, node = rb_prev(node); if (node) { + int seq = ref->seq; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) + if (ref->bytenr == bytenr && ref->seq == seq) goto out_unlock; } @@ -2903,25 +2978,29 @@ again: } spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED) { - /* We're not cached, don't bother trying to write stuff out */ + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(root, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option. + */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } spin_unlock(&block_group->lock); - num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; - /* - * Just to make absolutely sure we have enough space, we're going to - * preallocate 12 pages worth of space for each block group. In - * practice we ought to use at most 8, but we need extra space so we can - * add our header and have a terminator between the extents and the - * bitmaps. - */ num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; @@ -3134,6 +3213,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = found; return 0; } @@ -3263,12 +3344,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return get_alloc_profile(root, flags); } -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - BTRFS_BLOCK_GROUP_DATA); -} - /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. 
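The hunk above that preallocates the free space cache file now sizes it from the block group instead of using a flat estimate: one 16-page unit per 256MiB of block group, never less than one unit. A small arithmetic sketch of that sizing, assuming 4KiB pages purely as an example:

#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096ULL       /* assumed page size for the sketch */

/* Bytes to preallocate for the free space cache of one block group. */
static uint64_t cache_prealloc_bytes(uint64_t block_group_bytes)
{
        uint64_t units = block_group_bytes / (256ULL * 1024 * 1024);

        if (!units)
                units = 1;
        /* 16 pages per 256MiB unit: a 1GiB group gets 64 pages == 256KiB */
        return units * 16 * EXAMPLE_PAGE_SIZE;
}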
@@ -3277,6 +3352,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 used; int ret = 0, committed = 0, alloc_chunk = 1; @@ -3289,7 +3365,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) committed = 1; } - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = fs_info->data_sinfo; if (!data_sinfo) goto alloc; @@ -3319,7 +3395,6 @@ alloc: return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, alloc_target, CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); @@ -3330,10 +3405,9 @@ alloc: goto commit_trans; } - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } + if (!data_sinfo) + data_sinfo = fs_info->data_sinfo; + goto again; } @@ -3380,7 +3454,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - data_sinfo = BTRFS_I(inode)->space_info; + data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", @@ -3402,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes, - int force) + struct btrfs_space_info *sinfo, int force) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3418,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root, * and purposes it's used space. Don't worry about locking the * global_rsv, it doesn't change except when the transaction commits. 
*/ - num_allocated += global_rsv->size; + if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) + num_allocated += global_rsv->size; /* * in limited mode, we want to have some free space up to @@ -3432,15 +3506,8 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) + if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3490,8 +3557,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, } static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force) + struct btrfs_root *extent_root, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; @@ -3515,7 +3581,7 @@ again: return 0; } - if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { + if (!should_alloc_chunk(extent_root, space_info, force)) { spin_unlock(&space_info->lock); return 0; } else if (space_info->chunk_alloc) { @@ -3583,92 +3649,125 @@ out: return ret; } +static int can_overcommit(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + u64 used; + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + spin_unlock(&root->fs_info->free_chunk_lock); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. 
+ */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, + unsigned long nr_pages, + enum wb_reason reason) +{ + if (!writeback_in_progress(sb->s_bdi) && + down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, reason); + up_read(&sb->s_umount); + return 1; + } + + return 0; +} + /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; - u64 reserved; + u64 delalloc_bytes; u64 max_reclaim; - u64 reclaimed = 0; long time_left; unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; - unsigned long progress; + enum btrfs_reserve_flush_enum flush; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_may_use; - progress = space_info->reservation_progress; - - if (reserved == 0) - return 0; - - smp_mb(); - if (root->fs_info->delalloc_bytes == 0) { + delalloc_bytes = root->fs_info->delalloc_bytes; + if (delalloc_bytes == 0) { if (trans) - return 0; - btrfs_wait_ordered_extents(root, 0, 0); - return 0; + return; + btrfs_wait_ordered_extents(root, 0); + return; } - max_reclaim = min(reserved, to_reclaim); - nr_pages = max_t(unsigned long, nr_pages, - max_reclaim >> PAGE_CACHE_SHIFT); - while (loops < 1024) { - /* have the flusher threads jump in and do some IO */ - smp_mb(); - nr_pages = min_t(unsigned long, nr_pages, - root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); + while (delalloc_bytes && loops < 3) { + max_reclaim = min(delalloc_bytes, to_reclaim); + nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, + nr_pages, + WB_REASON_FS_FREE_SPACE); + + /* + * We need to wait for the async pages to actually start before + * we do anything. + */ + wait_event(root->fs_info->async_submit_wait, + !atomic_read(&root->fs_info->async_delalloc_pages)); + if (!trans) + flush = BTRFS_RESERVE_FLUSH_ALL; + else + flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (reserved > space_info->bytes_may_use) - reclaimed += reserved - space_info->bytes_may_use; - reserved = space_info->bytes_may_use; + if (can_overcommit(root, space_info, orig, flush)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; - - if (reserved == 0 || reclaimed >= max_reclaim) - break; - - if (trans && trans->transaction->blocked) - return -EAGAIN; - if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); } else { - time_left = schedule_timeout_interruptible(1); - - /* We were interrupted, exit */ + time_left = schedule_timeout_killable(1); if (time_left) break; } - - /* we've kicked the IO a few times, if anything has been freed, - * exit. 
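can_overcommit(), introduced above, lets a metadata reservation exceed the allocated space by a slice of the still-unallocated chunk space: the free space is halved for DUP/RAID1/RAID10 profiles because every block carries two copies, and then only 1/8 of it may be used when a full flush is allowed, 1/2 otherwise. A standalone sketch of that calculation; the enum mirrors the patch but the function signature is simplified:

#include <stdbool.h>
#include <stdint.h>

enum reserve_flush { RESERVE_NO_FLUSH, RESERVE_FLUSH_LIMIT, RESERVE_FLUSH_ALL };

static bool can_overcommit(uint64_t used, uint64_t total, uint64_t free_chunk,
                           uint64_t bytes, bool mirrored_profile,
                           enum reserve_flush flush)
{
        uint64_t avail = free_chunk;

        if (mirrored_profile)           /* DUP/RAID1/RAID10: half is copies */
                avail /= 2;

        if (flush == RESERVE_FLUSH_ALL)
                avail /= 8;             /* full flushing allowed: overcommit 1/8 */
        else
                avail /= 2;             /* no full flush: allow up to 1/2 */

        return used + bytes < total + avail;
}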
There is no sense in looping here for a long time - * when we really need to commit the transaction, or there are - * just too many writers without enough free space - */ - - if (loops > 3) { - smp_mb(); - if (progress != space_info->reservation_progress) - break; - } - + smp_mb(); + delalloc_bytes = root->fs_info->delalloc_bytes; } - - return reclaimed >= to_reclaim; } /** @@ -3728,12 +3827,78 @@ commit: return btrfs_commit_transaction(trans, root); } +enum flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + +static int flush_space(struct btrfs_root *root, + struct btrfs_space_info *space_info, u64 num_bytes, + u64 orig_bytes, int state) +{ + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) { + u64 bytes = btrfs_calc_trans_metadata_size(root, 1); + + nr = (int)div64_u64(num_bytes, bytes); + if (!nr) + nr = 1; + nr *= 2; + } else { + nr = -1; + } + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, root, nr); + btrfs_end_transaction(trans, root); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(root, num_bytes, orig_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case ALLOC_CHUNK: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans, root); + if (ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + ret = may_commit_transaction(root, space_info, orig_bytes, 0); + break; + default: + ret = -ENOSPC; + break; + } + + return ret; +} /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want - * @flush - wether or not we can flush to make our reservation + * @flush - whether or not we can flush to make our reservation * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3744,25 +3909,25 @@ commit: */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *space_info = block_rsv->space_info; u64 used; u64 num_bytes = orig_bytes; - int retries = 0; + int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool committed = false; bool flushing = false; - bool wait_ordered = false; again: ret = 0; spin_lock(&space_info->lock); /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. + * We only want to wait if somebody other than us is flushing and we + * are actually allowed to flush all things. */ - while (flush && !flushing && space_info->flush) { + while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && + space_info->flush) { spin_unlock(&space_info->lock); /* * If we have a trans handle we can't wait because the flusher @@ -3812,111 +3977,57 @@ again: * amount plus the amount of bytes that we need for this * reservation. 
*/ - wait_ordered = true; num_bytes = used - space_info->total_bytes + - (orig_bytes * (retries + 1)); + (orig_bytes * 2); } - if (ret) { - u64 profile = btrfs_get_alloc_profile(root, 0); - u64 avail; - - /* - * If we have a lot of space that's pinned, don't bother doing - * the overcommit dance yet and just commit the transaction. - */ - avail = (space_info->total_bytes - space_info->bytes_used) * 8; - do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !committed) { - space_info->flush = 1; - flushing = true; - spin_unlock(&space_info->lock); - ret = may_commit_transaction(root, space_info, - orig_bytes, 1); - if (ret) - goto out; - committed = true; - goto again; - } - - spin_lock(&root->fs_info->free_chunk_lock); - avail = root->fs_info->free_chunk_space; - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. - */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; - - /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. - */ - if (flush) - avail >>= 3; - else - avail >>= 1; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (used + num_bytes < space_info->total_bytes + avail) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - wait_ordered = true; - } + if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; } /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else * stealing it from us. + * + * We make the other tasks wait for the flush only when we can flush + * all things. */ - if (ret && flush) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret || !flush) - goto out; - - /* - * We do synchronous shrinking since we don't actually unreserve - * metadata until after the IO is completed. - */ - ret = shrink_delalloc(root, num_bytes, wait_ordered); - if (ret < 0) + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) goto out; - ret = 0; + ret = flush_space(root, space_info, num_bytes, orig_bytes, + flush_state); + flush_state++; /* - * So if we were overcommitted it's possible that somebody else flushed - * out enough space and we simply didn't have enough space to reclaim, - * so go back around and try again. + * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock + * would happen. So skip delalloc flush. 
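reserve_metadata_bytes() now walks the flush_state enum above instead of hand-rolled retry counters: every failed attempt escalates, from trimming a few delayed items up to committing the transaction, and BTRFS_RESERVE_FLUSH_LIMIT callers skip the delalloc states and stop short of the commit to avoid the deadlock described in the comment. A condensed sketch of that escalation loop; the reserve and flush callbacks are placeholders, not the kernel functions:

enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};

enum reserve_flush { RESERVE_NO_FLUSH, RESERVE_FLUSH_LIMIT, RESERVE_FLUSH_ALL };

static int reserve_bytes(int (*try_reserve)(void),
                         void (*run_flush_state)(int state),
                         enum reserve_flush flush)
{
        int state = FLUSH_DELAYED_ITEMS_NR;

        for (;;) {
                if (try_reserve() == 0)
                        return 0;               /* reservation succeeded */
                if (flush == RESERVE_NO_FLUSH)
                        return -1;              /* caller may not flush at all */
                /* limited callers must not touch delalloc: it can deadlock */
                if (flush == RESERVE_FLUSH_LIMIT &&
                    (state == FLUSH_DELALLOC || state == FLUSH_DELALLOC_WAIT))
                        state = ALLOC_CHUNK;
                /* limited callers also stop short of a transaction commit */
                if (flush == RESERVE_FLUSH_LIMIT && state >= COMMIT_TRANS)
                        return -1;
                if (state > COMMIT_TRANS)
                        return -1;              /* every escalation step exhausted */
                run_flush_state(state);
                state++;
        }
}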
*/ - if (retries < 2) { - wait_ordered = true; - retries++; - goto again; - } + if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT)) + flush_state = ALLOC_CHUNK; - ret = -ENOSPC; - if (committed) - goto out; - - ret = may_commit_transaction(root, space_info, orig_bytes, 0); - if (!ret) { - committed = true; + if (!ret) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && + flush_state < COMMIT_TRANS) + goto again; + else if (flush == BTRFS_RESERVE_FLUSH_ALL && + flush_state <= COMMIT_TRANS) goto again; - } out: if (flushing) { @@ -3934,7 +4045,10 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows || root == root->fs_info->csum_root) + if (root->ref_cows) + block_rsv = trans->block_rsv; + + if (root == root->fs_info->csum_root && trans->adding_csums) block_rsv = trans->block_rsv; if (!block_rsv) @@ -4031,13 +4145,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); + rsv->type = type; } -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, + unsigned short type) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4046,7 +4162,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv); + btrfs_init_block_rsv(block_rsv, type); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); return block_rsv; @@ -4055,13 +4171,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { + if (!rsv) + return; btrfs_block_rsv_release(root, rsv, (u64)-1); kfree(rsv); } -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) { int ret; @@ -4077,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { @@ -4109,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -4139,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, 
block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) @@ -4286,6 +4376,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + if (!trans->block_rsv) + return; + if (!trans->bytes_reserved) return; @@ -4330,10 +4423,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; /* - * two for root back/forward refs, two for directory entries - * and one for root of the snapshot. + * two for root back/forward refs, two for directory entries, + * one for root of the snapshot and one for parent inode. */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); dst_rsv->space_info = src_rsv->space_info; return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4440,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 csum_bytes; unsigned nr_extents = 0; int extra_reserve = 0; - int flush = 1; - int ret; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(root, inode)) - flush = 0; + /* If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } - if (flush && btrfs_transaction_in_commit(root->fs_info)) + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + if (delalloc_lock) + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); + num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4476,7 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (root->fs_info->quota_enabled) + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + + /* + * ret != 0 here means the qgroup reservation failed, we go straight to + * the shared error handling then. 
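
The reservation path above now pairs a qgroup (quota) reservation with the ordinary metadata reservation and falls through to the shared error handling when either step fails, freeing the quota bytes again on the way out. Below is a minimal userspace sketch of that reserve-then-roll-back pairing; the struct pool type and the pool_reserve()/pool_free()/reserve_delalloc() helpers are illustrative stand-ins, not btrfs APIs.

#include <stdio.h>

/* Illustrative stand-in for a space-accounting pool. */
struct pool {
	long long limit;
	long long reserved;
};

/* Try to take bytes from a pool; fail ENOSPC-style if it would exceed the limit. */
static int pool_reserve(struct pool *p, long long bytes)
{
	if (p->reserved + bytes > p->limit)
		return -1;
	p->reserved += bytes;
	return 0;
}

static void pool_free(struct pool *p, long long bytes)
{
	p->reserved -= bytes;
}

/*
 * Reserve from the quota pool first, then from the metadata pool.  If the
 * second reservation fails, the first is rolled back so the two accounts
 * never drift apart.
 */
static int reserve_delalloc(struct pool *qgroup, struct pool *meta,
			    long long bytes)
{
	int ret = pool_reserve(qgroup, bytes);

	if (ret)
		return ret;
	ret = pool_reserve(meta, bytes);
	if (ret)
		pool_free(qgroup, bytes);	/* roll back on failure */
	return ret;
}

int main(void)
{
	struct pool qgroup = { .limit = 1 << 20 };
	struct pool meta = { .limit = 64 * 1024 };

	printf("first 64K reservation: %d\n",
	       reserve_delalloc(&qgroup, &meta, 64 * 1024));
	printf("second 64K reservation: %d\n",
	       reserve_delalloc(&qgroup, &meta, 64 * 1024));
	printf("qgroup bytes still reserved: %lld\n", qgroup.reserved);
	return 0;
}

The second attempt fails in the metadata pool, and the quota bytes it took are returned, which is the behaviour the error path above preserves.
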
+ */ + if (ret == 0) + ret = reserve_metadata_bytes(root, block_rsv, + to_reserve, flush); + if (ret) { u64 to_free = 0; unsigned dropped; @@ -4506,7 +4620,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + nr_extents * root->leafsize); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4518,7 +4637,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) trace_btrfs_space_reservation(root->fs_info,"delalloc", @@ -4554,6 +4675,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4863,9 +4989,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 len; + bool readonly; while (start <= end) { + readonly = false; if (!cache || start >= cache->key.objectid + cache->key.offset) { if (cache) @@ -4883,15 +5013,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } start += len; + space_info = cache->space_info; - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; + space_info->bytes_pinned -= len; + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + if (!readonly && global_rsv->space_info == space_info) { + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + len = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += len; + space_info->bytes_may_use += len; + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + } + spin_unlock(&global_rsv->lock); + } + spin_unlock(&space_info->lock); } if (cache) @@ -4918,7 +5063,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); + EXTENT_DIRTY, NULL); if (ret) break; @@ -4996,8 +5141,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5015,8 +5162,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } extent_slot = path->slots[0]; } } else if (ret == -ENOENT) { @@ -5030,7 +5179,8 @@ static int 
__btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)owner_objectid, (unsigned long long)owner_offset); } else { - goto abort; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } leaf = path->nodes[0]; @@ -5040,8 +5190,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(found_extent || extent_slot != path->slots[0]); ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); path->leave_spinning = 1; @@ -5058,8 +5210,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } - if (ret < 0) - goto abort; + if (ret < 0) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + extent_slot = path->slots[0]; leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, extent_slot); @@ -5096,8 +5251,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } } else { if (found_extent) { @@ -5114,27 +5271,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } ret = update_block_group(trans, root, bytenr, num_bytes, 0); - if (ret) - goto abort; + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } } out: btrfs_free_path(path); return ret; - -abort: - btrfs_abort_transaction(trans, extent_root, ret); - goto out; } /* @@ -5190,8 +5349,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the @@ -5348,7 +5505,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } -static int __get_block_group_index(u64 flags) +int __get_raid_index(u64 flags) { int index; @@ -5368,7 +5525,7 @@ static int __get_block_group_index(u64 flags) static int get_block_group_index(struct btrfs_block_group_cache *cache) { - return __get_block_group_index(cache->flags); + return __get_raid_index(cache->flags); } enum btrfs_loop_type { @@ -5399,11 +5556,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? 
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; @@ -5432,9 +5587,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) @@ -5708,10 +5860,6 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); if (used_block_group != block_group) btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); @@ -5744,30 +5892,17 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - if (ret < 0) { - btrfs_abort_transaction(trans, - root, ret); - goto out; - } - allowed_chunk_alloc = 0; - if (ret == 1) - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == - CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + ret = do_chunk_alloc(trans, root, data, + CHUNK_ALLOC_FORCE); + /* + * Do not bail out on ENOSPC since we + * can do more things. + */ + if (ret < 0 && ret != -ENOSPC) { + btrfs_abort_transaction(trans, + root, ret); + goto out; } - - /* - * We didn't allocate a chunk, go ahead and drop the - * empty size and loop again. - */ - if (!done_chunk_alloc) - loop = LOOP_NO_EMPTY_SIZE; } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -5816,13 +5951,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", + printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", (unsigned long long)cache->key.objectid, (unsigned long long)cache->key.offset, (unsigned long long)btrfs_block_group_used(&cache->item), (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); + (unsigned long long)cache->reserved, + cache->ro ? 
"[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } @@ -5842,20 +5977,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, data = btrfs_get_alloc_profile(root, data); again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, - CHUNK_ALLOC_NO_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - } - WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, hint_byte, ins, data); @@ -5865,12 +5986,6 @@ again: num_bytes = num_bytes >> 1; num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -6193,7 +6308,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -6212,15 +6328,15 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret) { + if (ret && !block_rsv->failfast) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) { - printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); - WARN_ON(1); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", + ret); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { @@ -6670,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } @@ -7177,7 +7295,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7187,7 +7305,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + ret = do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; @@ -7201,7 +7319,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type) { u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + return do_chunk_alloc(trans, root, alloc_flags, CHUNK_ALLOC_FORCE); } @@ -7351,7 +7469,7 
@@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ target = get_restripe_target(root->fs_info, block_group->flags); if (target) { - index = __get_block_group_index(extended_to_chunk(target)); + index = __get_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full @@ -7385,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * check to make sure we can actually find a chunk with enough * space to fit our block group in. */ - if (device->total_bytes > device->bytes_used + min_free) { + if (device->total_bytes > device->bytes_used + min_free && + !device->is_tgtdev_for_dev_replace) { ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) @@ -7610,8 +7729,21 @@ int btrfs_read_block_groups(struct btrfs_root *root) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - if (need_clear) + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ cache->disk_cache_state = BTRFS_DC_CLEAR; + if (btrfs_test_opt(root, SPACE_CACHE)) + cache->dirty = 1; + } read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -7695,6 +7827,34 @@ error: return ret; } +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, + new_bg_list) { + list_del_init(&block_group->new_bg_list); + + if (ret) + continue; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); + } +} + int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -7728,6 +7888,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); btrfs_init_free_space_ctl(cache); @@ -7759,12 +7920,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); /* Logic error */ - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - return ret; - } + list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 01c21b6c6d4..1b319df29ee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -45,6 +45,7 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; + unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree 
*tree) int __init extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) @@ -107,6 +108,12 @@ void extent_io_exit(void) list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } + + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); if (extent_state_cache) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) @@ -334,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", + if (end < start) + WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", (unsigned long long)end, (unsigned long long)start); - WARN_ON(1); - } state->start = start; state->end = end; @@ -929,12 +934,14 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, /** - * convert_extent - convert all bits in a given range from one bit to another + * convert_extent_bit - convert all bits in a given range from one bit to + * another * @tree: the io tree to search * @start: the start offset in bytes * @end: the end offset in bytes (inclusive) * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist @@ -944,7 +951,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask) + int bits, int clear_bits, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -961,6 +969,15 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + state->tree) { + node = &state->rb_node; + goto hit_next; + } + } + /* * this search will find all the extents that end after * our range starts. 
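
convert_extent_bit() gains a cached_state argument here so that repeat callers can skip the rb-tree walk whenever the hint from the previous call still covers the range being converted. The standalone sketch below models that hint-before-search idea with a flat array instead of an rb-tree; struct range and find_range() are illustrative names, not kernel code.

#include <stddef.h>
#include <stdio.h>

struct range {
	unsigned long long start;
	unsigned long long end;	/* inclusive */
};

/*
 * Look up the range containing 'off'.  If the caller passes a cached hit
 * from a previous lookup and it still covers 'off', return it without
 * scanning; otherwise do the full search and refresh the cache.
 */
static struct range *find_range(struct range *tbl, size_t n,
				unsigned long long off, struct range **cached)
{
	size_t i;

	if (cached && *cached &&
	    (*cached)->start <= off && off <= (*cached)->end)
		return *cached;			/* fast path: hint still valid */

	for (i = 0; i < n; i++) {		/* slow path: full scan */
		if (tbl[i].start <= off && off <= tbl[i].end) {
			if (cached)
				*cached = &tbl[i];	/* remember for next time */
			return &tbl[i];
		}
	}
	return NULL;
}

int main(void)
{
	struct range tbl[] = { { 0, 4095 }, { 4096, 8191 }, { 8192, 12287 } };
	struct range *cached = NULL;
	struct range *first = find_range(tbl, 3, 4100, &cached);

	printf("second lookup reused the hint: %d\n",
	       find_range(tbl, 3, 5000, &cached) == first);
	return 0;
}
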
@@ -991,6 +1008,7 @@ hit_next: */ if (state->start == start && state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1031,6 +1049,7 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); + cache_state(state, cached_state); state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; @@ -1069,6 +1088,7 @@ hit_next: &bits); if (err) extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1091,6 +1111,7 @@ hit_next: extent_io_tree_panic(tree, err); set_state_bits(tree, prealloc, &bits); + cache_state(prealloc, cached_state); clear_state_bit(tree, prealloc, &clear_bits, 0); prealloc = NULL; goto out; @@ -1143,6 +1164,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, NULL, cached_state, mask); } +int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { @@ -1287,18 +1316,42 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) + u64 *start_ret, u64 *end_ret, int bits, + struct extent_state **cached_state) { struct extent_state *state; + struct rb_node *n; int ret = 1; spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && state->tree) { + n = rb_next(&state->rb_node); + while (n) { + state = rb_entry(n, struct extent_state, + rb_node); + if (state->state & bits) + goto got_it; + n = rb_next(n); + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + state = find_first_extent_bit_state(tree, start, bits); +got_it: if (state) { + cache_state(state, cached_state); *start_ret = state->start; *end_ret = state->end; ret = 0; } +out: spin_unlock(&tree->lock); return ret; } @@ -1864,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err) * the standard behavior is to write all copies in a raid setup. here we only * want to write the one bad copy. so we do the mapping for ourselves and issue * submit_bio directly. - * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which * actually prevents the read that triggered the error from finishing. * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. 
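
As the comment above spells out, when more than one copy of the data exists the repair path rewrites exactly the one bad mirror instead of resubmitting the write to every copy. A toy model of that decision follows; struct mirror, repair_bad_mirror() and the fixed block size are invented for illustration and do not correspond to btrfs structures.

#include <stdio.h>
#include <string.h>

#define COPIES 2
#define BLKSZ  8

/* A toy "device" holding one mirror of a block. */
struct mirror {
	unsigned char data[BLKSZ];
};

static int data_ok(const struct mirror *m, const unsigned char *expect)
{
	return memcmp(m->data, expect, BLKSZ) == 0;
}

/*
 * If a mirror fails verification and another copy is good, rewrite only
 * the bad mirror from the good one.  With a single copy there is nothing
 * to repair from, so give up.
 */
static int repair_bad_mirror(struct mirror *mirrors, int num_copies,
			     int bad, const unsigned char *expect)
{
	int i;

	if (num_copies < 2)
		return -1;			/* nothing to rewrite from */
	for (i = 0; i < num_copies; i++) {
		if (i == bad || !data_ok(&mirrors[i], expect))
			continue;
		memcpy(mirrors[bad].data, mirrors[i].data, BLKSZ);
		return 0;			/* exactly one rewrite issued */
	}
	return -1;				/* no good copy left */
}

int main(void)
{
	unsigned char good[BLKSZ] = "btrfs!!";
	struct mirror m[COPIES];
	int ret;

	memcpy(m[0].data, good, BLKSZ);
	memset(m[1].data, 0, BLKSZ);		/* simulate a corrupted mirror */

	ret = repair_bad_mirror(m, COPIES, 1, good);
	printf("repair: %d, mirror 1 now ok: %d\n", ret, data_ok(&m[1], good));
	return 0;
}
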
*/ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 length, u64 logical, struct page *page, int mirror_num) { @@ -1891,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, bio->bi_size = 0; map_length = length; - ret = btrfs_map_block(map_tree, WRITE, logical, + ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { bio_put(bio); @@ -1918,7 +1971,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } - printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " + printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " "(dev %s sector %llu)\n", page->mapping->host->i_ino, start, rcu_str_deref(dev->name), sector); @@ -1929,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, + ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, start, p, mirror_num); if (ret) break; @@ -1955,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; + struct btrfs_fs_info *fs_info; struct extent_state *state; int num_copies; int did_repair = 0; @@ -1991,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page) spin_unlock(&BTRFS_I(inode)->io_tree.lock); if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - failrec->len); + fs_info = BTRFS_I(inode)->root->fs_info; + num_copies = btrfs_num_copies(fs_info, failrec->logical, + failrec->len); if (num_copies > 1) { - ret = repair_io_failure(map_tree, start, failrec->len, + ret = repair_io_failure(fs_info, start, failrec->len, failrec->logical, page, failrec->failed_mirror); did_repair = !ret; @@ -2061,7 +2113,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } read_unlock(&em_tree->lock); - if (!em || IS_ERR(em)) { + if (!em) { kfree(failrec); return -EIO; } @@ -2104,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * clean_io_failure() clean all those errors at once. 
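
Read errors are remembered in failure records hung off the inode's io tree so that a later successful read of the same range can clear them all in one pass, as the clean_io_failure() comment above describes. Here is a self-contained approximation of that record-and-clean bookkeeping; the flat failures[] array and clean_failures() helper stand in for the io_failure_record handling and are not kernel APIs.

#include <stdio.h>

#define MAX_FAILURES 16

/* Toy failure records: byte offsets of reads that previously failed. */
static unsigned long long failures[MAX_FAILURES];
static int nr_failures;

static void record_failure(unsigned long long off)
{
	if (nr_failures < MAX_FAILURES)
		failures[nr_failures++] = off;
}

/*
 * Called after a later read of the same area succeeds: drop every record
 * that falls inside [start, end] in one pass, mirroring the "clean all
 * those errors at once" behaviour described above.
 */
static int clean_failures(unsigned long long start, unsigned long long end)
{
	int i, kept = 0, cleaned = 0;

	for (i = 0; i < nr_failures; i++) {
		if (failures[i] >= start && failures[i] <= end) {
			cleaned++;
			continue;
		}
		failures[kept++] = failures[i];
	}
	nr_failures = kept;
	return cleaned;
}

int main(void)
{
	int cleaned;

	record_failure(4096);
	record_failure(8192);
	record_failure(1 << 20);

	cleaned = clean_failures(0, 64 * 1024);
	printf("cleaned %d record(s), %d still pending\n", cleaned, nr_failures);
	return 0;
}
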
*/ } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); if (num_copies == 1) { /* * we only have a single copy of the data, so don't bother with @@ -2297,8 +2348,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; - pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " - "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " + "mirror=%ld\n", (u64)bio->bi_sector, err, (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2329,23 +2380,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); - if (ret) { - /* no IO indicated but software detected errors - * in the block, either checksum errors or - * issues with the contents */ - struct btrfs_root *root = - BTRFS_I(page->mapping->host)->root; - struct btrfs_device *device; - + if (ret) uptodate = 0; - device = btrfs_find_device_for_logical( - root, start, mirror); - if (device) - btrfs_dev_stat_inc_and_print(device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } else { + else clean_io_failure(start, page); - } } if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { @@ -2424,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. - */ static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { @@ -2715,12 +2749,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - BUG_ON(ret == -ENOMEM); - nr++; - *bio_flags = this_bio_flag; + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } } - if (ret) + if (ret) { SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1); + } cur = cur + iosize; pg_offset += iosize; } @@ -3077,8 +3114,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, } } + /* + * We need to do this to prevent races in people who check if the eb is + * under IO since we can end up having no IO bits set for a short period + * of time. + */ + spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); spin_lock(&fs_info->delalloc_lock); if (fs_info->dirty_metadata_bytes >= eb->len) @@ -3087,6 +3131,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, WARN_ON(1); spin_unlock(&fs_info->delalloc_lock); ret = 1; + } else { + spin_unlock(&eb->refs_lock); } btrfs_tree_unlock(eb); @@ -3158,12 +3204,16 @@ static int write_one_eb(struct extent_buffer *eb, struct block_device *bdev = fs_info->fs_devices->latest_bdev; u64 offset = eb->start; unsigned long i, num_pages; + unsigned long bio_flags = 0; int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE); int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); + if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) + bio_flags = EXTENT_BIO_TREE_LOG; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); @@ -3172,7 +3222,8 @@ static int write_one_eb(struct extent_buffer *eb, ret = submit_extent_page(rw, eb->tree, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, - 0, 0, 0); + 0, epd->bio_flags, bio_flags); + epd->bio_flags = bio_flags; if (ret) { set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); SetPageError(p); @@ -3207,6 +3258,7 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; int ret = 0; int done = 0; @@ -3251,19 +3303,34 @@ retry: break; } + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + continue; + } + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON + * but no sense in crashing the users box for something + * we can survive anyway. + */ if (!eb) { + spin_unlock(&mapping->private_lock); WARN_ON(1); continue; } - if (eb == prev_eb) + if (eb == prev_eb) { + spin_unlock(&mapping->private_lock); continue; + } - if (!atomic_inc_not_zero(&eb->refs)) { - WARN_ON(1); + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) continue; - } prev_eb = eb; ret = lock_extent_buffer_for_io(eb, fs_info, &epd); @@ -3454,7 +3521,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->sync_io) rw = WRITE_SYNC; - ret = submit_one_bio(rw, epd->bio, 0, 0); + ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -3477,6 +3544,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -3501,6 +3569,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, + .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -3540,6 +3609,7 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .bio_flags = 0, }; ret = extent_write_cache_pages(tree, mapping, wbc, @@ -3557,19 +3627,38 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; unsigned long bio_flags = 0; + struct page *pagepool[16]; + struct page *page; + int i = 0; + int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { - __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); + page_cache_release(page); + continue; } - page_cache_release(page); + + pagepool[nr++] = page; + if (nr < ARRAY_SIZE(pagepool)) + continue; + for (i = 0; i < nr; i++) { + __extent_read_full_page(tree, pagepool[i], get_extent, + &bio, 0, &bio_flags); + 
page_cache_release(pagepool[i]); + } + nr = 0; + } + for (i = 0; i < nr; i++) { + __extent_read_full_page(tree, pagepool[i], get_extent, + &bio, 0, &bio_flags); + page_cache_release(pagepool[i]); } + BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(READ, bio, 0, bio_flags); @@ -3898,18 +3987,6 @@ out: return ret; } -inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - -inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - static void __free_extent_buffer(struct extent_buffer *eb) { #if LEAK_DEBUG @@ -4025,8 +4102,8 @@ st |
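
extent_readpages() above now gathers pages into a small on-stack pool and only calls __extent_read_full_page() once the pool fills, with one final flush for whatever is left over, rather than submitting one page at a time. The standalone sketch below models that batch-and-flush loop; POOL_SIZE, read_one() and readpages() are illustrative stand-ins for the kernel helpers.

#include <stdio.h>

#define POOL_SIZE 16

/* Stand-in for submitting one read; here it just reports the page. */
static void read_one(int page)
{
	printf("read page %d\n", page);
}

static void flush_pool(int *pool, int *nr)
{
	int i;

	for (i = 0; i < *nr; i++)
		read_one(pool[i]);
	*nr = 0;
}

/*
 * Gather pages into a fixed-size pool and flush it whenever it fills,
 * then flush the remainder at the end, so reads go out in batches
 * rather than one call per page.
 */
static void readpages(const int *pages, int nr_pages)
{
	int pool[POOL_SIZE];
	int nr = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		pool[nr++] = pages[i];
		if (nr == POOL_SIZE)
			flush_pool(pool, &nr);
	}
	flush_pool(pool, &nr);		/* submit the partial final batch */
}

int main(void)
{
	int pages[40];
	int i;

	for (i = 0; i < 40; i++)
		pages[i] = i;
	readpages(pages, 40);
	return 0;
}

With 40 pages and a pool of 16, the reads go out as batches of 16, 16 and 8, which is the grouping the new extent_readpages() loop produces.
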