diff options
Diffstat (limited to 'fs')
916 files changed, 69471 insertions, 38924 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 945aa5f02f9..a9ea73d6dcf 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, uint16_t klen = 0; v9ses = (struct v9fs_session_info *)cookie_netfs_data; - P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, - buffer, bufmax); + p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", + v9ses, buffer, bufmax); if (v9ses->cachetag) klen = strlen(v9ses->cachetag); @@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, return 0; memcpy(buffer, v9ses->cachetag, klen); - P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); return klen; } @@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, v9ses); - P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", + v9ses, v9ses->fscache); } void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", + v9ses, v9ses->fscache); fscache_relinquish_cookie(v9ses->fscache, 0); v9ses->fscache = NULL; } @@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->qid.path); + p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", + &v9inode->vfs_inode, v9inode->qid.path); return sizeof(v9inode->qid.path); } @@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, const struct v9fs_inode *v9inode = cookie_netfs_data; *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, - *size); + p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", + &v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, @@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->qid.version); + p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", + &v9inode->vfs_inode, v9inode->qid.version); return sizeof(v9inode->qid.version); } @@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) @@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 0); v9inode->fscache = NULL; @@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 1); v9inode->fscache = NULL; @@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", + inode, old, v9inode->fscache); spin_unlock(&v9inode->fscache_lock); } @@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (!v9inode->fscache) return -ENOBUFS; @@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); return 1; case 0: - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); if (!v9inode->fscache) return -ENOBUFS; @@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode, switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); return 1; case 0: BUG_ON(!list_empty(pages)); BUG_ON(*nr_pages != 0); - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); - P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); } @@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (PageFsCache(page)) fscache_wait_on_page_write(v9inode->fscache, page); } diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 85b67ffa2a4..da8eefbe830 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { struct v9fs_dentry *dent; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", - fid->fid, dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); dent = dentry->d_fsdata; if (!dent) { @@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) struct v9fs_dentry *dent; struct p9_fid *fid, *ret; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); dent = (struct v9fs_dentry *) dentry->d_fsdata; ret = NULL; if (dent) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2b78014a124..b85efa77394 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -85,15 +87,15 @@ static int get_cache_mode(char *s) if (!strcmp(s, "loose")) { version = CACHE_LOOSE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else - printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + pr_info("Unknown Cache mode %s\n", s); return version; } @@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of cache arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); goto free_and_return; } ret = get_cache_mode(s); @@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of access arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); goto free_and_return; } @@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; - printk(KERN_INFO "9p: Unknown access " - "argument %s.\n", s); + pr_info("Unknown access argument %s\n", + s); kfree(s); goto free_and_return; } @@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else - P9_DPRINTK(P9_DEBUG_ERROR, - "Not defined CONFIG_9P_FS_POSIX_ACL. " - "Ignoring posixacl option\n"); + p9_debug(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; @@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto error; } @@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(fid)) { retval = PTR_ERR(fid); fid = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto error; } @@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } @@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } @@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void) static int __init init_v9fs(void) { int err; - printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ - err = register_filesystem(&v9fs_fs_type); - if (err < 0) { - printk(KERN_ERR "Failed to register filesystem\n"); - return err; - } err = v9fs_cache_register(); if (err < 0) { - printk(KERN_ERR "Failed to register v9fs for caching\n"); - goto out_fs_unreg; + pr_err("Failed to register v9fs for caching\n"); + return err; } err = v9fs_sysfs_init(); if (err < 0) { - printk(KERN_ERR "Failed to register with sysfs\n"); + pr_err("Failed to register with sysfs\n"); + goto out_cache; + } + err = register_filesystem(&v9fs_fs_type); + if (err < 0) { + pr_err("Failed to register filesystem\n"); goto out_sysfs_cleanup; } @@ -616,8 +617,8 @@ static int __init init_v9fs(void) out_sysfs_cleanup: v9fs_sysfs_cleanup(); -out_fs_unreg: - unregister_filesystem(&v9fs_fs_type); +out_cache: + v9fs_cache_unregister(); return err; } diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 410ffd6ceb5..dc95a252523 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t); + struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 2524e4cbb8e..0ad61c6a65a 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) struct inode *inode; inode = page->mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); BUG_ON(!PageLocked(page)); @@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, struct inode *inode; inode = mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); if (ret == 0) return ret; ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); - P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } @@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * Now that we do caching with cache mode enabled, We need * to support direct IO */ - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " - "off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, - (long long) pos, nr_segs); + p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long)pos, nr_segs); return -EINVAL; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index e022890c6f4..d529437ff44 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,8 +53,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); return 1; } @@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) */ static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); /* Don't cache negative dentries */ if (!dentry->d_inode) @@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry) struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); dent = dentry->d_fsdata; if (dent) { list_for_each_entry_safe(current_fid, temp, &dent->fidlist, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 598fff1a54e..ff911e77965 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; @@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); goto unlock_and_exit; @@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, struct p9_dirent curdirent; u64 oldoffset = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; @@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, rdir->tail - rdir->head, &curdirent); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; goto unlock_and_exit; } @@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; fid = filp->private_data; - P9_DPRINTK(P9_DEBUG_VFS, - "v9fs_dir_release: inode: %p filp: %p fid: %d\n", - inode, filp, fid ? fid->fid : -1); + p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 62857a810a7..fc06fd27065 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) @@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) int res = 0; struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + break; } /* map 9p status to VFS status */ @@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, { int n, total, size; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); + p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", + fid->fid, (long long unsigned)offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, struct p9_fid *fid; size_t size; - P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, loff_t origin = *offset; unsigned long pg_start, pg_end; - P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, - (int)count, (int)*offset); + p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", + data, (int)count, (int)*offset); clnt = fid->clnt; do { @@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); @@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", - filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; @@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", - page, (unsigned long)filp->private_data); + p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 879ed885173..57ccb7537da 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -59,15 +61,13 @@ static const struct inode_operations v9fs_symlink_inode_operations; * */ -static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) +static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) { int res; res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; if (v9fs_proto_dotu(v9ses)) { - if (S_ISLNK(mode)) - res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { if (S_ISSOCK(mode)) res |= P9_DMSOCKET; @@ -85,10 +85,33 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res |= P9_DMSETGID; if ((mode & S_ISVTX) == S_ISVTX) res |= P9_DMSETVTX; - if ((mode & P9_DMLINK)) - res |= P9_DMLINK; } + return res; +} +/** + * p9mode2perm- convert plan9 mode bits to unix permission bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * + */ +static int p9mode2perm(struct v9fs_session_info *v9ses, + struct p9_wstat *stat) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } return res; } @@ -99,14 +122,14 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) * @rdev: major number, minor number in case of device files. * */ -static int p9mode2unixmode(struct v9fs_session_info *v9ses, - struct p9_wstat *stat, dev_t *rdev) +static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; - int mode = stat->mode; + u32 mode = stat->mode; - res = mode & S_IALLUGO; *rdev = 0; + res = p9mode2perm(v9ses, stat); if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -133,24 +156,13 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, res |= S_IFBLK; break; default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); + p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", + type, stat->extension); }; *rdev = MKDEV(major, minor); } else res |= S_IFREG; - if (v9fs_proto_dotu(v9ses)) { - if ((mode & P9_DMSETUID) == P9_DMSETUID) - res |= S_ISUID; - - if ((mode & P9_DMSETGID) == P9_DMSETGID) - res |= S_ISGID; - - if ((mode & P9_DMSETVTX) == P9_DMSETVTX) - res |= S_ISVTX; - } return res; } @@ -251,7 +263,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) static void v9fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } @@ -261,7 +272,7 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t rdev) + struct inode *inode, umode_t mode, dev_t rdev) { int err = 0; @@ -281,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; } else { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); + p9_debug(P9_DEBUG_ERROR, + "special files without extended mode\n"); err = -EINVAL; goto error; } @@ -307,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " - "legacy protocol.\n"); + p9_debug(P9_DEBUG_ERROR, + "extended modes used with legacy protocol\n"); err = -EINVAL; goto error; } @@ -335,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; default: - P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); + p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", + mode, mode & S_IFMT); err = -EINVAL; goto error; } @@ -352,17 +363,18 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) { int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); inode = new_inode(sb); if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + pr_warn("%s (%d): Problem allocating inode\n", + __func__, task_pid_nr(current)); return ERR_PTR(-ENOMEM); } err = v9fs_init_inode(v9ses, inode, mode, rdev); @@ -436,7 +448,7 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode *v9inode = V9FS_I(inode); truncate_inode_pages(inode->i_mapping, 0); - end_writeback(inode); + clear_inode(inode); filemap_fdatawrite(inode->i_mapping); #ifdef CONFIG_9P_FSCACHE @@ -492,7 +504,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, int new) { dev_t rdev; - int retval, umode; + int retval; + umode_t umode; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -578,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) struct p9_fid *v9fid, *dfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", - dir, dentry, flags); + p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", + dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); inode = dentry->d_inode; dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); return retval; } if (v9fs_proto_dotl(v9ses)) @@ -635,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; ofid = NULL; @@ -644,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return ERR_PTR(err); } @@ -652,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; } - /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; + if (!(perm & P9_DMLINK)) { + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, + "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + /* + * instantiate inode and assign the unopened fid to the dentry + */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, + "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - d_instantiate(dentry, inode); return ofid; error: if (ofid) @@ -703,7 +721,7 @@ error: */ static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; @@ -786,14 +804,14 @@ error: * */ -static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int err; u32 perm; struct p9_fid *fid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode | S_IFDIR); @@ -831,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -938,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; @@ -974,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * 9P .u can only handle file rename in the same directory */ - P9_DPRINTK(P9_DEBUG_ERROR, - "old dir and new dir are different\n"); + p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } @@ -1031,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -1068,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; @@ -1131,7 +1148,7 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { - mode_t mode; + umode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; @@ -1167,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, i_nlink); } } - mode = stat->mode & S_IALLUGO; + mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; i_size_write(inode, stat->length); @@ -1213,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); @@ -1235,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) /* copy extension buffer into buffer */ strncpy(buffer, st->extension, buflen); - P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", + dentry->d_name.name, st->extension, buffer); retval = strnlen(buffer, buflen); done: @@ -1257,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) link = ERR_PTR(-ENOMEM); @@ -1288,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); - P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, - IS_ERR(s) ? "<error>" : s); + p9_debug(P9_DEBUG_VFS, " %s %s\n", + dentry->d_name.name, IS_ERR(s) ? "<error>" : s); if (!IS_ERR(s)) __putname(s); } @@ -1304,19 +1321,17 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) */ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, - int mode, const char *extension) + u32 perm, const char *extension) { - u32 perm; struct p9_fid *fid; struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + p9_debug(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } - perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, P9_OREAD); if (IS_ERR(fid)) @@ -1340,10 +1355,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, - dentry->d_name.name, symname); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, symname); - return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); + return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } /** @@ -1362,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char *name; struct p9_fid *oldfid; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); oldfid = v9fs_fid_clone(old_dentry); if (IS_ERR(oldfid)) @@ -1398,14 +1412,16 @@ clunk_fid: */ static int -v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; char *name; + u32 perm; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, mode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -1427,7 +1443,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return -EINVAL; } - retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + perm = unixmode2p9mode(v9ses, mode); + retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); __putname(name); return retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0b5745e2194..e3dd2a1e2bf 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -48,7 +48,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) return current_fsgid(); } -/** - * v9fs_dentry_from_dir_inode - helper function to get the dentry from - * dir inode. - * - */ - -static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) -{ - struct dentry *dentry; - - spin_lock(&inode->i_lock); - /* Directory should have only one entry. */ - BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); - dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); - spin_unlock(&inode->i_lock); - return dentry; -} - static int v9fs_test_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -253,7 +235,7 @@ int v9fs_open_to_dotl_flags(int flags) */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct nameidata *nd) { int err = 0; @@ -283,13 +265,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, } name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", + name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -297,7 +279,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return err; } @@ -307,16 +289,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); goto error; } v9fs_invalidate_inode_attr(dir); @@ -325,14 +306,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -395,7 +376,7 @@ err_clunk_old_fid: */ static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, int omode) + struct dentry *dentry, umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -408,7 +389,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); @@ -416,11 +397,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, if (dir->i_mode & S_ISGID) omode |= S_ISGID; - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -430,8 +411,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -444,8 +425,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -453,8 +434,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -495,7 +476,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_stat_dotl *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -523,6 +504,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, return 0; } +/* + * Attribute flags. + */ +#define P9_ATTR_MODE (1 << 0) +#define P9_ATTR_UID (1 << 1) +#define P9_ATTR_GID (1 << 2) +#define P9_ATTR_SIZE (1 << 3) +#define P9_ATTR_ATIME (1 << 4) +#define P9_ATTR_MTIME (1 << 5) +#define P9_ATTR_CTIME (1 << 6) +#define P9_ATTR_ATIME_SET (1 << 7) +#define P9_ATTR_MTIME_SET (1 << 8) + +struct dotl_iattr_map { + int iattr_valid; + int p9_iattr_valid; +}; + +static int v9fs_mapped_iattr_valid(int iattr_valid) +{ + int i; + int p9_iattr_valid = 0; + struct dotl_iattr_map dotl_iattr_map[] = { + { ATTR_MODE, P9_ATTR_MODE }, + { ATTR_UID, P9_ATTR_UID }, + { ATTR_GID, P9_ATTR_GID }, + { ATTR_SIZE, P9_ATTR_SIZE }, + { ATTR_ATIME, P9_ATTR_ATIME }, + { ATTR_MTIME, P9_ATTR_MTIME }, + { ATTR_CTIME, P9_ATTR_CTIME }, + { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, + { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, + }; + for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { + if (iattr_valid & dotl_iattr_map[i].iattr_valid) + p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; + } + return p9_iattr_valid; +} + /** * v9fs_vfs_setattr_dotl - set file metadata * @dentry: file whose metadata to set @@ -537,13 +558,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_iattr_dotl p9attr; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; - p9attr.valid = iattr->ia_valid; + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); p9attr.mode = iattr->ia_mode; p9attr.uid = iattr->ia_uid; p9attr.gid = iattr->ia_gid; @@ -594,7 +615,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { - mode_t mode; + umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -670,14 +691,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -687,7 +707,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } @@ -697,8 +717,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -707,8 +727,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -751,12 +771,11 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); @@ -770,7 +789,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } @@ -799,7 +818,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; @@ -813,19 +832,19 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, omode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -835,8 +854,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -851,8 +870,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -860,8 +879,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -905,7 +924,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) char *link = __getname(); char *target; - P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) { link = ERR_PTR(-ENOMEM); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index c70251d47ed..8c92a9ba833 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -117,11 +117,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - int mode = S_IRWXUGO | S_ISVTX; + umode_t mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; - P9_DPRINTK(P9_DEBUG_VFS, " \n"); + p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) @@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); retval = -ENOMEM; goto release_sb; } @@ -191,7 +190,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; v9fs_fid_add(root, fid); - P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: @@ -223,7 +222,7 @@ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); @@ -231,7 +230,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); + p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void @@ -260,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { - buf->f_type = V9FS_MAGIC; + buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; @@ -303,7 +302,7 @@ static int v9fs_write_inode(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; @@ -326,7 +325,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index d288773871b..29653b70a9c 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { retval = PTR_ERR(attr_fid); - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_attrwalk failed %zd\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", + retval); attr_fid = NULL; goto error; } @@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, { struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", - __func__, name, buffer_size); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, int retval, msize, write_count; struct p9_fid *fid = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", - __func__, name, value_len, flags); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", + name, value_len, flags); fid = v9fs_fid_clone(dentry); if (IS_ERR(fid)) { @@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, */ retval = p9_client_xattrcreate(fid, name, value_len, flags); if (retval < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_xattrcreate failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", + retval); goto error; } msize = fid->clnt->msize; diff --git a/fs/Kconfig b/fs/Kconfig index 5f4c45d4aa1..f95ae3a027f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@ menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + if BLOCK source "fs/ext2/Kconfig" @@ -210,6 +214,7 @@ source "fs/minix/Kconfig" source "fs/omfs/Kconfig" source "fs/hpfs/Kconfig" source "fs/qnx4/Kconfig" +source "fs/qnx6/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" @@ -218,6 +223,8 @@ source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS +source "fs/exofs/Kconfig.ore" + menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y @@ -266,14 +273,6 @@ source "fs/9p/Kconfig" endif # NETWORK_FILESYSTEMS -if BLOCK -menu "Partition Types" - -source "fs/partitions/Kconfig" - -endmenu -endif - source "fs/nls/Kconfig" source "fs/dlm/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7973b..02257420274 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,10 +27,13 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y - depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each diff --git a/fs/Makefile b/fs/Makefile index d2c3353d547..2fb97793467 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,8 @@ else obj-y += no-block.o endif +obj-$(CONFIG_PROC_FS) += proc_namespace.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o @@ -52,7 +54,6 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ @@ -101,6 +102,7 @@ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c8bf36a1996..06fdcc9382c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -126,9 +126,9 @@ static void adfs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int adfs_show_options(struct seq_file *seq, struct dentry *root) { - struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb); + struct adfs_sb_info *asb = ADFS_SB(root->d_sb); if (asb->s_uid != 0) seq_printf(seq, ",uid=%u", asb->s_uid); @@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { int i; - iput(root); for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c2b9c79eb64..1fceb320d2f 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -18,14 +18,6 @@ #define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) -#ifdef __LITTLE_ENDIAN -#define BO_EXBITS 0x18UL -#elif defined(__BIG_ENDIAN) -#define BO_EXBITS 0x00UL -#else -#error Endianness must be known for affs to work. -#endif - #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) #define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) #define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) @@ -136,7 +128,7 @@ extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); -extern mode_t prot_to_mode(u32 prot); +extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); @@ -156,8 +148,8 @@ extern void affs_free_bitmap(struct super_block *sb); extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); -extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index de37ec84234..52a6407682e 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -390,10 +390,10 @@ secs_to_datestamp(time_t secs, struct affs_date *ds) ds->ticks = cpu_to_be32(secs * 50); } -mode_t +umode_t prot_to_mode(u32 prot) { - int mode = 0; + umode_t mode = 0; if (!(prot & FIBF_NOWRITE)) mode |= S_IWUSR; @@ -421,7 +421,7 @@ void mode_to_prot(struct inode *inode) { u32 prot = AFFS_I(inode)->i_protect; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (!(mode & S_IXUSR)) prot |= FIBF_NOEXECUTE; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 88a4b0b5005..8bc4a59f4e7 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -264,7 +264,7 @@ affs_evict_inode(struct inode *inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); affs_free_prealloc(inode); cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 780a11dc631..47806940aac 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -255,13 +255,13 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode; int error; - pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len, + pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len, dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -285,12 +285,12 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata } int -affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int error; - pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino, + pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino, (int)dentry->d_name.len,dentry->d_name.name,mode); inode = affs_new_inode(dir); diff --git a/fs/affs/super.c b/fs/affs/super.c index b31507d0f9b..0782653a05a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -98,7 +98,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb) static void affs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } @@ -474,7 +473,7 @@ got_root: root_inode = affs_iget(sb, root_block); if (IS_ERR(root_inode)) { ret = PTR_ERR(root_inode); - goto out_error_noinode; + goto out_error; } if (AFFS_SB(sb)->s_flags & SF_INTL) @@ -482,7 +481,7 @@ got_root: else sb->s_d_op = &affs_dentry_operations; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; @@ -495,9 +494,6 @@ got_root: * Begin the cascaded cleanup ... */ out_error: - if (root_inode) - iput(root_inode); -out_error_noinode: kfree(sbi->s_bitmap); affs_brelse(root_bh); kfree(sbi->s_prefix); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1b0b1955001..e22dc4b4a50 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,9 +28,9 @@ static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, @@ -764,7 +764,7 @@ static void afs_d_release(struct dentry *dentry) /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_file_status status; struct afs_callback cb; @@ -777,7 +777,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o", + _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; @@ -948,7 +948,7 @@ error: /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct afs_file_status status; @@ -962,7 +962,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode, dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o,", + _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; diff --git a/fs/afs/file.c b/fs/afs/file.c index 14d89fa58fe..8f6e9234d56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -251,7 +251,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); - if (vnode->flags & AFS_VNODE_DELETED) { + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { _leave(" = -ESTALE"); return -ESTALE; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f213d109c2..b960ff05ea0 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -365,10 +365,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); ret = afs_extract_data(call, skb, last, buffer, call->count); - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -411,9 +411,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page, KM_USER0); + buffer = kmap_atomic(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer, KM_USER0); + kunmap_atomic(buffer); } _leave(" = 0 [done]"); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d890ae3b2ce..95cffd38239 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -423,7 +423,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); afs_give_up_callback(vnode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d2b0888126d..a306bb6d88d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -109,7 +109,7 @@ struct afs_call { unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned short offset; /* offset into received data store */ + unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index aa59184151d..298cf8919ec 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -200,9 +200,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (PageError(page)) goto error; - buf = kmap_atomic(page, KM_USER0); + buf = kmap_atomic(page); memcpy(devname, buf, size); - kunmap_atomic(buf, KM_USER0); + kunmap_atomic(buf); page_cache_release(page); page = NULL; } @@ -242,7 +242,7 @@ struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); + _enter("{%s}", path->dentry->d_name.name); newmnt = afs_mntpt_do_automount(path->dentry); if (IS_ERR(newmnt)) @@ -252,7 +252,7 @@ struct vfsmount *afs_d_automount(struct path *path) mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); - _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + _leave(" = %p", newmnt); return newmnt; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e45a323aebb..8ad8c2a0703 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; + struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); rxrpc_kernel_end_call(rxcall); call->rxcall = NULL; error_kill_call: diff --git a/fs/afs/super.c b/fs/afs/super.c index 356dcf0929e..f02b31e7e64 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb, { struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; - struct dentry *root = NULL; struct inode *inode = NULL; int ret; @@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb, set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; - root = d_alloc_root(inode); - if (!root) + sb->s_root = d_make_root(inode); + if (!sb->s_root) goto error; sb->s_d_op = &afs_fs_dentry_operations; - sb->s_root = root; _leave(" = 0"); return 0; error: - iput(inode); _leave(" = %d", ret); return ret; } @@ -495,7 +492,6 @@ static void afs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct afs_vnode *vnode = AFS_FS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(afs_inode_cachep, vnode); } @@ -13,7 +13,7 @@ #include <linux/errno.h> #include <linux/time.h> #include <linux/aio_abi.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> #include <linux/backing-dev.h> #include <linux/uio.h> @@ -93,9 +93,8 @@ static void aio_free_ring(struct kioctx *ctx) put_page(info->ring_pages[i]); if (info->mmap_size) { - down_write(&ctx->mm->mmap_sem); - do_munmap(ctx->mm, info->mmap_base, info->mmap_size); - up_write(&ctx->mm->mmap_sem); + BUG_ON(ctx->mm != current->mm); + vm_munmap(info->mmap_base, info->mmap_size); } if (info->ring_pages && info->ring_pages != info->internal_pages) @@ -135,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx) info->mmap_size = nr_pages * PAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, - 0); + info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -160,7 +159,7 @@ static int aio_setup_ring(struct kioctx *ctx) info->nr = nr_events; /* trusted copy */ - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(info->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -168,47 +167,38 @@ static int aio_setup_ring(struct kioctx *ctx) ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); - kunmap_atomic(ring, KM_USER0); + kunmap_atomic(ring); return 0; } /* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); + * kmap_atomic(). Release the pointer with put_aio_ring_event(); */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ +#define aio_ring_event(info, nr) ({ \ unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ + (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ __event += pos % AIO_EVENTS_PER_PAGE; \ __event; \ }) -#define put_aio_ring_event(event, km) do { \ +#define put_aio_ring_event(event) do { \ struct io_event *__event = (event); \ (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ + kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ } while(0) static void ctx_rcu_free(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - unsigned nr_events = ctx->max_reqs; - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } } /* __put_ioctx @@ -217,23 +207,23 @@ static void ctx_rcu_free(struct rcu_head *head) */ static void __put_ioctx(struct kioctx *ctx) { + unsigned nr_events = ctx->max_reqs; BUG_ON(ctx->reqs_active); - cancel_delayed_work(&ctx->wq); - cancel_work_sync(&ctx->wq.work); + cancel_delayed_work_sync(&ctx->wq); aio_free_ring(ctx); mmdrop(ctx->mm); ctx->mm = NULL; + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } pr_debug("__put_ioctx: freeing %p\n", ctx); call_rcu(&ctx->rcu_head, ctx_rcu_free); } -static inline void get_ioctx(struct kioctx *kioctx) -{ - BUG_ON(atomic_read(&kioctx->users) <= 0); - atomic_inc(&kioctx->users); -} - static inline int try_get_ioctx(struct kioctx *kioctx) { return atomic_inc_not_zero(&kioctx->users); @@ -253,7 +243,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm; struct kioctx *ctx; - int did_sync = 0; + int err = -ENOMEM; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -262,7 +252,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if ((unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -273,7 +263,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) mm = ctx->mm = current->mm; atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 1); + atomic_set(&ctx->users, 2); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -286,25 +276,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) goto out_freectx; /* limit the number of system wide aios */ - do { - spin_lock_bh(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || - aio_nr + nr_events < aio_nr) - ctx->max_reqs = 0; - else - aio_nr += ctx->max_reqs; - spin_unlock_bh(&aio_nr_lock); - if (ctx->max_reqs || did_sync) - break; - - /* wait for rcu callbacks to have completed before giving up */ - synchronize_rcu(); - did_sync = 1; - ctx->max_reqs = nr_events; - } while (1); - - if (ctx->max_reqs == 0) + spin_lock(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) { + spin_unlock(&aio_nr_lock); goto out_cleanup; + } + aio_nr += ctx->max_reqs; + spin_unlock(&aio_nr_lock); /* now link into global list. */ spin_lock(&mm->ioctx_lock); @@ -316,27 +295,27 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ctx; out_cleanup: - __put_ioctx(ctx); - return ERR_PTR(-EAGAIN); - + err = -EAGAIN; + aio_free_ring(ctx); out_freectx: mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - ctx = ERR_PTR(-ENOMEM); - - dprintk("aio: error allocating ioctx %p\n", ctx); - return ctx; + dprintk("aio: error allocating ioctx %d\n", err); + return ERR_PTR(err); } -/* aio_cancel_all +/* kill_ctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void aio_cancel_all(struct kioctx *ctx) +static void kill_ctx(struct kioctx *ctx) { int (*cancel)(struct kiocb *, struct io_event *); + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); struct io_event res; + spin_lock_irq(&ctx->ctx_lock); ctx->dead = 1; while (!list_empty(&ctx->active_reqs)) { @@ -352,15 +331,7 @@ static void aio_cancel_all(struct kioctx *ctx) spin_lock_irq(&ctx->ctx_lock); } } - spin_unlock_irq(&ctx->ctx_lock); -} -static void wait_for_all_aios(struct kioctx *ctx) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - spin_lock_irq(&ctx->ctx_lock); if (!ctx->reqs_active) goto out; @@ -410,19 +381,24 @@ void exit_aio(struct mm_struct *mm) ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); hlist_del_rcu(&ctx->list); - aio_cancel_all(ctx); - - wait_for_all_aios(ctx); - /* - * Ensure we don't leave the ctx on the aio_wq - */ - cancel_work_sync(&ctx->wq.work); + kill_ctx(ctx); if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); + /* + * We don't need to bother with munmap() here - + * exit_mmap(mm) is coming and it'll unmap everything. + * Since aio_free_ring() uses non-zero ->mmap_size + * as indicator that it needs to unmap the area, + * just set it to 0; aio_free_ring() is the only + * place that uses ->mmap_size, so it's safe. + * That way we get all munmap done to current->mm - + * all other callers have ctx->mm == current->mm. + */ + ctx->ring_info.mmap_size = 0; put_ioctx(ctx); } } @@ -476,14 +452,23 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total) batch->count = total; } -static void kiocb_batch_free(struct kiocb_batch *batch) +static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) { struct kiocb *req, *n; + if (list_empty(&batch->head)) + return; + + spin_lock_irq(&ctx->ctx_lock); list_for_each_entry_safe(req, n, &batch->head, ki_batch) { list_del(&req->ki_batch); + list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; } + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up_all(&ctx->wait); + spin_unlock_irq(&ctx->ctx_lock); } /* @@ -600,11 +585,16 @@ static void aio_fput_routine(struct work_struct *data) fput(req->ki_filp); /* Link the iocb into the context's free list */ + rcu_read_lock(); spin_lock_irq(&ctx->ctx_lock); really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); - put_ioctx(ctx); spin_lock_irq(&fput_lock); } spin_unlock_irq(&fput_lock); @@ -635,7 +625,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * this function will be executed w/out any aio kthread wakeup. */ if (unlikely(!fput_atomic(req->ki_filp))) { - get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); @@ -913,7 +902,7 @@ static void aio_kick_handler(struct work_struct *work) unuse_mm(mm); set_fs(oldfs); /* - * we're in a worker thread already, don't use queue_delayed_work, + * we're in a worker thread already; no point using non-zero delay */ if (requeue) queue_delayed_work(aio_wq, &ctx->wq, 0); @@ -1012,10 +1001,10 @@ int aio_complete(struct kiocb *iocb, long res, long res2) if (kiocbIsCancelled(iocb)) goto put_rq; - ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); + ring = kmap_atomic(info->ring_pages[0]); tail = info->tail; - event = aio_ring_event(info, tail, KM_IRQ0); + event = aio_ring_event(info, tail); if (++tail >= info->nr) tail = 0; @@ -1036,8 +1025,8 @@ int aio_complete(struct kiocb *iocb, long res, long res2) info->tail = tail; ring->tail = tail; - put_aio_ring_event(event, KM_IRQ0); - kunmap_atomic(ring, KM_IRQ1); + put_aio_ring_event(event); + kunmap_atomic(ring); pr_debug("added to ring %p at [%lu]\n", iocb, tail); @@ -1082,7 +1071,7 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) unsigned long head; int ret = 0; - ring = kmap_atomic(info->ring_pages[0], KM_USER0); + ring = kmap_atomic(info->ring_pages[0]); dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); @@ -1094,18 +1083,18 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) head = ring->head % info->nr; if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + struct io_event *evp = aio_ring_event(info, head); *ent = *evp; head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ ring->head = head; ret = 1; - put_aio_ring_event(evp, KM_USER1); + put_aio_ring_event(evp); } spin_unlock(&info->ring_lock); out: - kunmap_atomic(ring, KM_USER0); + kunmap_atomic(ring); dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, (unsigned long)ring->head, (unsigned long)ring->tail); return ret; @@ -1283,8 +1272,7 @@ static void io_destroy(struct kioctx *ioctx) if (likely(!was_dead)) put_ioctx(ioctx); /* twice for the list */ - aio_cancel_all(ioctx); - wait_for_all_aios(ioctx); + kill_ctx(ioctx); /* * Wake up any waiters. The setting of ctx->dead must be seen @@ -1292,7 +1280,6 @@ static void io_destroy(struct kioctx *ioctx) * locking done by the above calls to ensure this consistency. */ wake_up_all(&ioctx->wait); - put_ioctx(ioctx); /* once for the lookup */ } /* sys_io_setup: @@ -1329,11 +1316,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) - return 0; - - get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ - io_destroy(ioctx); + if (ret) + io_destroy(ioctx); + put_ioctx(ioctx); } out: @@ -1351,6 +1336,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { io_destroy(ioctx); + put_ioctx(ioctx); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1460,13 +1446,17 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec, 1); + &kiocb->ki_iovec); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec, 1); + &kiocb->ki_iovec); + if (ret < 0) + goto out; + + ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); if (ret < 0) goto out; @@ -1481,11 +1471,17 @@ out: return ret; } -static ssize_t aio_setup_single_vector(struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) { + int bytes; + + bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); + if (bytes < 0) + return bytes; + kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_left; + kiocb->ki_iovec->iov_len = bytes; kiocb->ki_nr_segs = 1; kiocb->ki_cur_seg = 0; return 0; @@ -1510,10 +1506,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(READ, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1528,10 +1521,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(WRITE, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1542,9 +1532,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(READ, kiocb, compat); if (ret) break; @@ -1556,9 +1543,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(WRITE, kiocb, compat); if (ret) break; @@ -1742,7 +1726,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - kiocb_batch_free(&batch); + kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index f11e43ed907..28d39fb84ae 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -39,19 +39,6 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { .d_dname = anon_inodefs_dname, }; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "anon_inode:", NULL, - &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); -} - -static struct file_system_type anon_inode_fs_type = { - .name = "anon_inodefs", - .mount = anon_inodefs_mount, - .kill_sb = kill_anon_super, -}; - /* * nop .set_page_dirty method so that people can use .page_mkwrite on * anon inodes. @@ -65,6 +52,62 @@ static const struct address_space_operations anon_aops = { .set_page_dirty = anon_set_page_dirty, }; +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. + */ +static struct inode *anon_inode_mkinode(struct super_block *s) +{ + struct inode *inode = new_inode_pseudo(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode->i_fop = &anon_inode_fops; + + inode->i_mapping->a_ops = &anon_aops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because mark_inode_dirty() will think + * that it already _is_ on the dirty list. + */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *root; + root = mount_pseudo(fs_type, "anon_inode:", NULL, + &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); + if (!IS_ERR(root)) { + struct super_block *s = root->d_sb; + anon_inode_inode = anon_inode_mkinode(s); + if (IS_ERR(anon_inode_inode)) { + dput(root); + deactivate_locked_super(s); + root = ERR_CAST(anon_inode_inode); + } + } + return root; +} + +static struct file_system_type anon_inode_fs_type = { + .name = "anon_inodefs", + .mount = anon_inodefs_mount, + .kill_sb = kill_anon_super, +}; + /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" @@ -180,38 +223,6 @@ err_put_unused_fd: } EXPORT_SYMBOL_GPL(anon_inode_getfd); -/* - * A single inode exists for all anon_inode files. Contrary to pipes, - * anon_inode inodes have no associated per-instance data, so we need - * only allocate one of them. - */ -static struct inode *anon_inode_mkinode(void) -{ - struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); - - if (!inode) - return ERR_PTR(-ENOMEM); - - inode->i_ino = get_next_ino(); - inode->i_fop = &anon_inode_fops; - - inode->i_mapping->a_ops = &anon_aops; - - /* - * Mark the inode dirty from the very beginning, - * that way it will never be moved to the dirty - * list because mark_inode_dirty() will think - * that it already _is_ on the dirty list. - */ - inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - return inode; -} - static int __init anon_inode_init(void) { int error; @@ -224,16 +235,8 @@ static int __init anon_inode_init(void) error = PTR_ERR(anon_inode_mnt); goto err_unregister_filesystem; } - anon_inode_inode = anon_inode_mkinode(); - if (IS_ERR(anon_inode_inode)) { - error = PTR_ERR(anon_inode_inode); - goto err_mntput; - } - return 0; -err_mntput: - kern_unmount(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/attr.c b/fs/attr.c index 7ee7ba48831..0da90951d27 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -5,7 +5,7 @@ * changes by Thomas Schoebel-Theuer */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> @@ -47,14 +47,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - (current_fsuid() != inode->i_uid || - attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) + (!uid_eq(current_fsuid(), inode->i_uid) || + !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - (current_fsuid() != inode->i_uid || - (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && + (!uid_eq(current_fsuid(), inode->i_uid) || + (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && !capable(CAP_CHOWN)) return -EPERM; @@ -166,7 +166,7 @@ EXPORT_SYMBOL(setattr_copy); int notify_change(struct dentry * dentry, struct iattr * attr) { struct inode *inode = dentry->d_inode; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error; struct timespec now; unsigned int ia_valid = attr->ia_valid; @@ -176,8 +176,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } + if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { + if (attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + } + if ((ia_valid & ATTR_MODE)) { - mode_t amode = attr->ia_mode; + umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ if (is_sxid(amode)) inode->i_flags &= ~S_NOSEC; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 326dc08d3e3..908e1845541 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -116,6 +116,7 @@ struct autofs_sb_info { int needs_reghost; struct super_block *sb; struct mutex wq_mutex; + struct mutex pipe_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t lookup_lock; @@ -155,7 +156,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -struct inode *autofs4_get_inode(struct super_block *, mode_t); +struct inode *autofs4_get_inode(struct super_block *, umode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ @@ -268,6 +269,17 @@ int autofs4_fill_super(struct super_block *, void *, int); struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); void autofs4_clean_ino(struct autofs_info *); +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!pipe->f_op || !pipe->f_op->write) + return -EINVAL; + if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + /* Queue management functions */ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 509fe1eb66a..aa9103f8f01 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -194,7 +194,7 @@ static int find_autofs_mount(const char *pathname, return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { - if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); if (!err) /* already found some */ @@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname, static int test_by_dev(struct path *path, void *p) { - return path->mnt->mnt_sb->s_dev == *(dev_t *)p; + return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(struct path *path, void *p) @@ -230,7 +230,7 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - FD_SET(fd, fdt->close_on_exec); + __set_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } @@ -376,7 +376,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, err = -EBADF; goto out; } - if (!pipe->f_op || !pipe->f_op->write) { + if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; @@ -538,11 +538,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; - devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; @@ -556,7 +556,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); if (follow_down_one(&path)) - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 450f529a4ea..1feb68ecef9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -124,6 +124,7 @@ start: /* Negative dentry - try next */ if (!simple_positive(q)) { spin_unlock(&p->d_lock); + lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); p = q; goto again; } @@ -186,6 +187,7 @@ again: /* Negative dentry - try next */ if (!simple_positive(ret)) { spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); p = ret; goto again; } diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index c038727b405..cddc74b9cdb 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -31,11 +31,11 @@ static int __init init_autofs4_fs(void) { int err; + autofs_dev_ioctl_init(); + err = register_filesystem(&autofs_fs_type); if (err) - return err; - - autofs_dev_ioctl_init(); + autofs_dev_ioctl_exit(); return err; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 8179f1ab817..8a4fed8ead3 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -70,10 +70,10 @@ out_kill_sb: kill_litter_super(sb); } -static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +static int autofs4_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); - struct inode *root_inode = mnt->mnt_sb->s_root->d_inode; + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct inode *root_inode = root->d_sb->s_root->d_inode; if (!sbi) return 0; @@ -100,7 +100,7 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) static void autofs4_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } @@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); @@ -244,12 +245,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!ino) goto fail_free; root_inode = autofs4_get_inode(s, S_IFDIR | 0755); - if (!root_inode) - goto fail_ino; - - root = d_alloc_root(root_inode); + root = d_make_root(root_inode); if (!root) - goto fail_iput; + goto fail_ino; pipe = NULL; root->d_fsdata = ino; @@ -292,7 +290,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (!pipe->f_op || !pipe->f_op->write) + if (autofs_prepare_pipe(pipe) < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; @@ -314,9 +312,6 @@ fail_fput: fail_dput: dput(root); goto fail_free; -fail_iput: - printk("autofs: get root dentry failed\n"); - iput(root_inode); fail_ino: kfree(ino); fail_free: @@ -326,7 +321,7 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) +struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index f55ae23b137..75e5f1c8e02 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -26,7 +26,7 @@ static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); -static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); #ifdef CONFIG_COMPAT static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); @@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e1fbdeef85d..da8876d38a7 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct file *file, const void *addr, int bytes) +static int autofs4_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; mm_segment_t fs; const char *data = (const char *)addr; ssize_t wr = 0; - /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/ - sigpipe = sigismember(¤t->pending.signal, SIGPIPE); /* Save pointer to user space and point back to kernel space */ fs = get_fs(); set_fs(KERNEL_DS); + mutex_lock(&sbi->pipe_mutex); while (bytes && (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { data += wr; bytes -= wr; } + mutex_unlock(&sbi->pipe_mutex); set_fs(fs); @@ -110,6 +111,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; + mutex_lock(&sbi->wq_mutex); + + /* Check if we have become catatonic */ + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -163,22 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } default: printk("autofs4_notify_daemon: bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); return; } - /* Check if we have become catatonic */ - mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { - pipe = sbi->pipe; - get_file(pipe); - } + pipe = sbi->pipe; + get_file(pipe); + mutex_unlock(&sbi->wq_mutex); - if (pipe) { - if (autofs4_write(pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); - fput(pipe); - } + if (autofs4_write(sbi, pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); } static int autofs4_getpath(struct autofs_sb_info *sbi, @@ -257,6 +261,9 @@ static int validate_request(struct autofs_wait_queue **wait, struct autofs_wait_queue *wq; struct autofs_info *ino; + if (sbi->catatonic) + return -ENOENT; + /* Wait in progress, continue; */ wq = autofs4_find_wait(sbi, qstr); if (wq) { @@ -289,6 +296,9 @@ static int validate_request(struct autofs_wait_queue **wait, if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + if (sbi->catatonic) + return -ENOENT; + wq = autofs4_find_wait(sbi, qstr); if (wq) { *wait = wq; @@ -389,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, ret = validate_request(&wq, sbi, &qstr, dentry, notify); if (ret <= 0) { - if (ret == 0) + if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); return ret; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9205cf25f1c..1b35d6bd06b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -9,7 +9,7 @@ */ #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> @@ -173,7 +173,7 @@ static const struct file_operations bad_file_ops = }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { return -EIO; } @@ -202,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, } static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return -EIO; } @@ -213,7 +213,7 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) } static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { return -EIO; } @@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops = .getxattr = bad_inode_getxattr, .listxattr = bad_inode_listxattr, .removexattr = bad_inode_removexattr, - /* truncate_range returns void */ }; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8342ca67abc..e18da23d42b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -286,7 +286,6 @@ befs_alloc_inode(struct super_block *sb) static void befs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } @@ -853,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto unacquire_priv_sbp; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 9cc07401947..d12c7966db2 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -84,7 +84,7 @@ const struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); -static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 697af5bf70b..9870417c26e 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -174,7 +174,7 @@ static void bfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (inode->i_nlink) return; @@ -251,7 +251,6 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) static void bfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } @@ -368,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) ret = PTR_ERR(inode); goto out2; } - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); ret = -ENOMEM; goto out2; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26a..d146e181d10 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -26,7 +26,6 @@ #include <linux/coredump.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/a.out-core.h> @@ -51,9 +50,7 @@ static int set_brk(unsigned long start, unsigned long end) end = PAGE_ALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -259,8 +256,14 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; @@ -275,9 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) pos = 32; map_size = ex.a_text+ex.a_data; #endif - down_write(¤t->mm->mmap_sem); - error = do_brk(text_addr & PAGE_MASK, map_size); - up_write(¤t->mm->mmap_sem); + error = vm_brk(text_addr & PAGE_MASK, map_size); if (error != (text_addr & PAGE_MASK)) { send_sig(SIGKILL, current, 0); return error; @@ -308,9 +309,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { loff_t pos = fd_offset; - down_write(¤t->mm->mmap_sem); - do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - up_write(¤t->mm->mmap_sem); + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); @@ -320,24 +319,20 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto beyond_if; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset); - up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { send_sig(SIGKILL, current, 0); return error; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset + ex.a_text); - up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { send_sig(SIGKILL, current, 0); return error; @@ -352,13 +347,6 @@ beyond_if: return retval; } - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); #ifdef __alpha__ @@ -414,9 +402,7 @@ static int load_aout_library(struct file *file) "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_path.dentry->d_name.name); } - down_write(¤t->mm->mmap_sem); - do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - up_write(¤t->mm->mmap_sem); + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); @@ -427,12 +413,10 @@ static int load_aout_library(struct file *file) goto out; } /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, N_TXTOFF(ex)); - up_write(¤t->mm->mmap_sem); retval = error; if (error != start_addr) goto out; @@ -440,9 +424,7 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - down_write(¤t->mm->mmap_sem); - error = do_brk(start_addr + len, bss - len); - up_write(¤t->mm->mmap_sem); + error = vm_brk(start_addr + len, bss - len); retval = error; if (error != start_addr + len) goto out; @@ -454,7 +436,8 @@ out: static int __init init_aout_binfmt(void) { - return register_binfmt(&aout_format); + register_binfmt(&aout_format); + return 0; } static void __exit exit_aout_binfmt(void) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 21ac5ee4b43..1b52956afe3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#include <asm/exec.h> static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); @@ -81,9 +82,7 @@ static int set_brk(unsigned long start, unsigned long end) end = ELF_PAGEALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -227,10 +226,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, cred->uid); - NEW_AUX_ENT(AT_EUID, cred->euid); - NEW_AUX_ENT(AT_GID, cred->gid); - NEW_AUX_ENT(AT_EGID, cred->egid); + NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -330,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, if (!size) return addr; - down_write(¤t->mm->mmap_sem); /* * total_size is the size of the ELF (interpreter) image. * The _first_ mmap needs to know the full size, otherwise @@ -341,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); - map_addr = do_mmap(filep, addr, total_size, prot, type, off); + map_addr = vm_mmap(filep, addr, total_size, prot, type, off); if (!BAD_ADDR(map_addr)) - do_munmap(current->mm, map_addr+size, total_size-size); + vm_munmap(map_addr+size, total_size-size); } else - map_addr = do_mmap(filep, addr, size, prot, type, off); + map_addr = vm_mmap(filep, addr, size, prot, type, off); - up_write(¤t->mm->mmap_sem); return(map_addr); } @@ -513,9 +510,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); /* Map the last of the bss segment */ - down_write(¤t->mm->mmap_sem); - error = do_brk(elf_bss, last_bss - elf_bss); - up_write(¤t->mm->mmap_sem); + error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) goto out_close; } @@ -712,7 +707,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_dentry; /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; /* Do this immediately, since STACK_TOP as used in setup_arg_pages @@ -794,7 +788,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. * If that is the case, retain the original non-zero @@ -934,7 +928,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) { @@ -963,10 +956,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, + error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); } #ifdef ELF_PLAT_INIT @@ -1051,8 +1042,7 @@ static int load_elf_library(struct file *file) eppnt++; /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, + error = vm_mmap(file, ELF_PAGESTART(eppnt->p_vaddr), (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), @@ -1060,7 +1050,6 @@ static int load_elf_library(struct file *file) MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); - up_write(¤t->mm->mmap_sem); if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; @@ -1073,11 +1062,8 @@ static int load_elf_library(struct file *file) len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) { - down_write(¤t->mm->mmap_sem); - do_brk(len, bss - len); - up_write(¤t->mm->mmap_sem); - } + if (bss > len) + vm_brk(len, bss - len); error = 0; out_free_ph: @@ -1095,6 +1081,29 @@ out: */ /* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* * Decide what to dump of a segment, part, all or none. */ static unsigned long vma_dump_size(struct vm_area_struct *vma, @@ -1102,10 +1111,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, { #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* The vma can be set up to tell us the answer directly. */ - if (vma->vm_flags & VM_ALWAYSDUMP) + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) goto whole; + if (vma->vm_flags & VM_NODUMP) + return 0; + /* Hugetlb memory check */ if (vma->vm_flags & VM_HUGETLB) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) @@ -1342,8 +1354,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); @@ -1390,6 +1402,22 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } +#ifndef PR_REG_SIZE +#define PR_REG_SIZE(S) sizeof(S) +#endif + +#ifndef PRSTATUS_SIZE +#define PRSTATUS_SIZE(S) sizeof(S) +#endif + +#ifndef PR_REG_PTR +#define PR_REG_PTR(S) (&((S)->pr_reg)) +#endif + +#ifndef SET_PR_FPVALID +#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#endif + static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, size_t *total) @@ -1404,11 +1432,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ fill_prstatus(&t->prstatus, t->task, signr); (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, sizeof(t->prstatus.pr_reg), - &t->prstatus.pr_reg, NULL); + 0, PR_REG_SIZE(t->prstatus.pr_reg), + PR_REG_PTR(&t->prstatus), NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - sizeof(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1421,7 +1449,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && + if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; size_t size = regset->n * regset->size; @@ -1438,7 +1466,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - t->prstatus.pr_fpvalid = 1; + SET_PR_FPVALID(&t->prstatus, 1); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } @@ -2077,7 +2105,8 @@ out: static int __init init_elf_binfmt(void) { - return register_binfmt(&elf_format); + register_binfmt(&elf_format); + return 0; } static void __exit exit_elf_binfmt(void) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 30745f459fa..3d77cf81ba3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,6 +39,7 @@ #include <asm/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> +#include <asm/exec.h> typedef char *elf_caddr_t; @@ -91,7 +92,8 @@ static struct linux_binfmt elf_fdpic_format = { static int __init init_elf_fdpic_binfmt(void) { - return register_binfmt(&elf_fdpic_format); + register_binfmt(&elf_fdpic_format); + return 0; } static void __exit exit_elf_fdpic_binfmt(void) @@ -334,8 +336,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = 0; current->mm->context.interp_fdpic_loadmap = 0; - current->flags &= ~PF_FORKNOEXEC; - #ifdef CONFIG_MMU elf_fdpic_arch_lay_out_mm(&exec_params, &interp_params, @@ -390,21 +390,17 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) stack_prot |= PROT_EXEC; - down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, + current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { - up_write(¤t->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - up_write(¤t->mm->mmap_sem); - current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; current->mm->context.end_brk += @@ -413,7 +409,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #endif install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) goto error_kill; @@ -632,10 +627,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); - NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); + NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -956,10 +951,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) mflags |= MAP_EXECUTABLE; - down_write(&mm->mmap_sem); - maddr = do_mmap(NULL, load_addr, top - base, + maddr = vm_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); - up_write(&mm->mmap_sem); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1097,10 +1090,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* create the mapping */ disp = phdr->p_vaddr & ~PAGE_MASK; - down_write(&mm->mmap_sem); - maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp); - up_write(&mm->mmap_sem); kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", loop, phdr->p_memsz + disp, prot, flags, @@ -1144,10 +1135,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; - down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, + xmaddr = vm_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); - up_write(&mm->mmap_sem); kdebug("mmap[%d] <anon>" " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", @@ -1432,8 +1421,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index b8e8b0acf9b..2790c7e1912 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = { static int __init init_em86_binfmt(void) { - return register_binfmt(&em86_format); + register_binfmt(&em86_format); + return 0; } static void __exit exit_em86_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1bffbe0ed77..178cb70acc2 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,7 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -37,7 +37,6 @@ #include <linux/syscalls.h> #include <asm/byteorder.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> @@ -543,10 +542,8 @@ static int load_flat_file(struct linux_binprm * bprm, */ DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) textpos = (unsigned long) -ENOMEM; @@ -557,17 +554,15 @@ static int load_flat_file(struct linux_binprm * bprm, len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - realdatastart = do_mmap(0, 0, len, + realdatastart = vm_mmap(0, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { if (!realdatastart) realdatastart = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process data, errno %d\n", (int)-realdatastart); - do_munmap(current->mm, textpos, text_len); + vm_munmap(textpos, text_len); ret = realdatastart; goto err; } @@ -591,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm, } if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); - do_munmap(current->mm, textpos, text_len); - do_munmap(current->mm, realdatastart, len); + vm_munmap(textpos, text_len); + vm_munmap(realdatastart, len); ret = result; goto err; } @@ -604,10 +599,8 @@ static int load_flat_file(struct linux_binprm * bprm, len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(0, 0, len, + textpos = vm_mmap(0, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) @@ -661,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm, } if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); - do_munmap(current->mm, textpos, text_len + data_len + extra + + vm_munmap(textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); ret = result; goto err; @@ -902,7 +895,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) libinfo.lib_list[j].start_data:UNLOADED_LIB; install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); @@ -950,7 +942,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) static int __init init_flat_binfmt(void) { - return register_binfmt(&flat_format); + register_binfmt(&flat_format); + return 0; } /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1e9edbdeda7..790b3cddca6 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/magic.h> #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> @@ -504,7 +505,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } @@ -560,7 +561,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 2: set_bit(Enabled, &e->flags); break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); kill_node(e); @@ -587,7 +588,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, Node *e; struct inode *inode; struct dentry *root, *dentry; - struct super_block *sb = file->f_path.mnt->mnt_sb; + struct super_block *sb = file->f_path.dentry->d_sb; int err = 0; e = create_entry(buffer, count); @@ -666,7 +667,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, switch (res) { case 1: enabled = 0; break; case 2: enabled = 1; break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) @@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; - int err = simple_fill_super(sb, 0x42494e4d, bm_files); + int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; @@ -726,11 +727,8 @@ static struct file_system_type bm_fs_type = { static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); - if (!err) { - err = insert_binfmt(&misc_format); - if (err) - unregister_filesystem(&bm_fs_type); - } + if (!err) + insert_binfmt(&misc_format); return err; } diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 396a9884591..d3b8c1f6315 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -105,7 +105,8 @@ static struct linux_binfmt script_format = { static int __init init_script_binfmt(void) { - return register_binfmt(&script_format); + register_binfmt(&script_format); + return 0; } static void __exit exit_script_binfmt(void) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index cc8560f6c9b..4517aaff61b 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -147,10 +147,8 @@ static int map_som_binary(struct file *file, code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); current->mm->start_code = code_start; current->mm->end_code = code_start + code_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, code_start, code_size, prot, + retval = vm_mmap(file, code_start, code_size, prot, flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; @@ -158,20 +156,16 @@ static int map_som_binary(struct file *file, data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); current->mm->start_data = data_start; current->mm->end_data = bss_start = data_start + data_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, data_start, data_size, + retval = vm_mmap(file, data_start, data_size, prot | PROT_WRITE, flags, SOM_PAGESTART(hpuxhdr->exec_dfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); current->mm->start_brk = current->mm->brk = som_brk; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(NULL, bss_start, som_brk - bss_start, + retval = vm_mmap(NULL, bss_start, som_brk - bss_start, prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (retval > 0 || retval < -1024) retval = 0; out: @@ -225,7 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; setup_new_exec(bprm); @@ -289,7 +282,8 @@ static int load_som_library(struct file *f) static int __init init_som_binfmt(void) { - return register_binfmt(&som_format); + register_binfmt(&som_format); + return 0; } static void __exit exit_som_binfmt(void) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index c2183f3917c..e85c04b9f61 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -357,7 +357,7 @@ static void bio_integrity_generate(struct bio *bio) bix.sector_size = bi->sector_size; bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + void *kaddr = kmap_atomic(bv->bv_page); bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -371,7 +371,7 @@ static void bio_integrity_generate(struct bio *bio) total += sectors * bi->tuple_size; BUG_ON(total > bio->bi_integrity->bip_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } } @@ -498,7 +498,7 @@ static int bio_integrity_verify(struct bio *bio) bix.sector_size = bi->sector_size; bio_for_each_segment(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + void *kaddr = kmap_atomic(bv->bv_page); bix.data_buf = kaddr + bv->bv_offset; bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; @@ -507,7 +507,7 @@ static int bio_integrity_verify(struct bio *bio) ret = bi->verify_fn(&bix); if (ret) { - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); return ret; } @@ -517,7 +517,7 @@ static int bio_integrity_verify(struct bio *bio) total += sectors * bi->tuple_size; BUG_ON(total > bio->bi_integrity->bip_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } return ret; @@ -19,12 +19,14 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> +#include <linux/cgroup.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -418,6 +420,7 @@ void bio_put(struct bio *bio) * last put frees it */ if (atomic_dec_and_test(&bio->bi_cnt)) { + bio_disassociate_task(bio); bio->bi_next = NULL; bio->bi_destructor(bio); } @@ -507,11 +510,12 @@ int bio_get_nr_vecs(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); int nr_pages; - nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > queue_max_segments(q)) - nr_pages = queue_max_segments(q); + nr_pages = min_t(unsigned, + queue_max_segments(q), + queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + + return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - return nr_pages; } EXPORT_SYMBOL(bio_get_nr_vecs); @@ -1645,6 +1649,64 @@ bad: } EXPORT_SYMBOL(bioset_create); +#ifdef CONFIG_BLK_CGROUP +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet. Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released. The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. + */ +int bio_associate_current(struct bio *bio) +{ + struct io_context *ioc; + struct cgroup_subsys_state *css; + + if (bio->bi_ioc) + return -EBUSY; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + /* acquire active ref on @ioc and associate */ + get_io_context_active(ioc); + bio->bi_ioc = ioc; + + /* associate blkcg if exists */ + rcu_read_lock(); + css = task_subsys_state(current, blkio_subsys_id); + if (css && css_tryget(css)) + bio->bi_css = css; + rcu_read_unlock(); + + return 0; +} + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ + if (bio->bi_ioc) { + put_io_context(bio->bi_ioc); + bio->bi_ioc = NULL; + } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } +} + +#endif /* CONFIG_BLK_CGROUP */ + static void __init biovec_init_slabs(void) { int i; diff --git a/fs/block_dev.c b/fs/block_dev.c index b07f1da1de4..c2bbe1fb132 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -16,7 +16,9 @@ #include <linux/blkdev.h> #include <linux/module.h> #include <linux/blkpg.h> +#include <linux/magic.h> #include <linux/buffer_head.h> +#include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> @@ -24,7 +26,7 @@ #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> -#include <linux/kmemleak.h> +#include <linux/cleancache.h> #include <asm/uaccess.h> #include "internal.h" @@ -68,7 +70,7 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -static sector_t max_block(struct block_device *bdev) +sector_t blkdev_max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); loff_t sz = i_size_read(bdev->bd_inode); @@ -82,13 +84,35 @@ static sector_t max_block(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) +void kill_bdev(struct block_device *bdev) { - if (bdev->bd_inode->i_mapping->nrpages == 0) + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) return; + invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); + truncate_inode_pages(mapping, 0); } +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { @@ -139,7 +163,7 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= max_block(I_BDEV(inode))) { + if (iblock >= blkdev_max_block(I_BDEV(inode))) { if (create) return -EIO; @@ -161,7 +185,7 @@ static int blkdev_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - sector_t end_block = max_block(I_BDEV(inode)); + sector_t end_block = blkdev_max_block(I_BDEV(inode)); unsigned long max_blocks = bh->b_size >> inode->i_blkbits; if ((iblock + max_blocks) > end_block) { @@ -425,7 +449,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } @@ -464,7 +487,7 @@ static void bdev_evict_inode(struct inode *inode) struct list_head *p; truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); /* is it needed here? */ - end_writeback(inode); + clear_inode(inode); spin_lock(&bdev_lock); while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { __bd_forget(list_entry(p, struct inode, i_devices)); @@ -484,7 +507,7 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); } static struct file_system_type bd_type = { @@ -493,12 +516,12 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; - struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -510,12 +533,7 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); - /* - * This vfsmount structure is only used to obtain the - * blockdev_superblock, so tell kmemleak not to report it. - */ - kmemleak_not_leak(bd_mnt); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* @@ -639,6 +657,11 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -1117,6 +1140,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1137,6 +1161,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1159,8 +1184,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1190,8 +1219,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } @@ -1210,6 +1243,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be14..d33f01c08b6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..0c4fa2befae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o + reada.o backref.o ulist.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 89b156d85d6..761e2cd8fed 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); + } else { + cache_no_acl(inode); } + } else { + cache_no_acl(inode); } failed: posix_acl_release(acl); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0b394580d86..42704149b72 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -171,11 +171,11 @@ out: spin_unlock_irqrestore(&workers->lock, flags); } -static noinline int run_ordered_completions(struct btrfs_workers *workers, +static noinline void run_ordered_completions(struct btrfs_workers *workers, struct btrfs_work *work) { if (!workers->ordered) - return 0; + return; set_bit(WORK_DONE_BIT, &work->flags); @@ -213,7 +213,6 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, } spin_unlock(&workers->order_lock); - return 0; } static void put_worker(struct btrfs_worker_thread *worker) @@ -334,7 +333,7 @@ again: if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); - refrigerator(); + try_to_freeze(); } else { spin_unlock_irq(&worker->lock); if (!kthread_should_stop()) { @@ -399,7 +398,7 @@ again: /* * this will wait for all the worker threads to shutdown */ -int btrfs_stop_workers(struct btrfs_workers *workers) +void btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; @@ -427,7 +426,6 @@ int btrfs_stop_workers(struct btrfs_workers *workers) put_worker(worker); } spin_unlock_irq(&workers->lock); - return 0; } /* @@ -615,14 +613,14 @@ found: * it was taken from. It is intended for use with long running work functions * that make some progress and want to give the cpu up for others. */ -int btrfs_requeue_work(struct btrfs_work *work) +void btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; spin_lock_irqsave(&worker->lock, flags); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) @@ -649,9 +647,6 @@ int btrfs_requeue_work(struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); -out: - - return 0; } void btrfs_set_work_high_prio(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index f34cc31fa3c..063698b90ce 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -111,9 +111,9 @@ struct btrfs_workers { void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers); -int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); -int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd..a383c18e74e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,19 +19,1048 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" +#include "locking.h" -struct __data_ref { - struct list_head list; +struct extent_inode_elem { u64 inum; - u64 root; - u64 extent_data_item_offset; + u64 offset; + struct extent_inode_elem *next; }; -struct __shared_ref { - struct list_head list; +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 data_offset; + u64 data_len; + struct extent_inode_elem *e; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + (extent_item_pos - data_offset); + *eie = e; + + return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { + struct list_head list; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 wanted_disk_byte; }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count) +{ + struct __prelim_ref *ref; + + /* in case we're adding delayed refs, we're holding the refs spinlock */ + ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key_for_search = *key; + else + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + + ref->inode_list = NULL; + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, int level, + struct btrfs_key *key_for_search, u64 time_seq, + u64 wanted_disk_byte, + const u64 *extent_item_pos) +{ + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_inode_elem *eie = NULL; + u64 disk_byte; + + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + return 0; + } + + /* + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. + */ + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + ret = btrfs_next_old_leaf(root, path, time_seq); + + while (!ret) { + eb = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (!ret) { + ret = ulist_add(parents, eb->start, + (unsigned long)eie, GFP_NOFS); + if (ret < 0) + break; + if (!extent_item_pos) { + ret = btrfs_next_old_leaf(root, path, + time_seq); + continue; + } + } + } + ret = btrfs_next_old_item(root, path, time_seq); + } + + if (ret > 0) + ret = 0; + return ret; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + int search_commit_root, + u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_key root_key; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->search_commit_root = !!search_commit_root; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + rcu_read_lock(); + root_level = btrfs_header_level(root->node); + rcu_read_unlock(); + + if (root_level + 1 == level) + goto out; + + path->lowest_level = level; + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + (unsigned long long)ref->root_id, level, ref->count, ret, + (unsigned long long)ref->key_for_search.objectid, + ref->key_for_search.type, + (unsigned long long)ref->key_for_search.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + while (!eb) { + if (!level) { + WARN_ON(1); + ret = 1; + goto out; + } + level--; + eb = path->nodes[level]; + } + + ret = add_all_parents(root, path, parents, level, &ref->key_for_search, + time_seq, ref->wanted_disk_byte, + extent_item_pos); +out: + btrfs_free_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + int search_commit_root, u64 time_seq, + struct list_head *head, + const u64 *extent_item_pos) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + struct ulist_iterator uiter; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, search_commit_root, + time_seq, ref, parents, + extent_item_pos); + if (err) { + if (ret == 0) + ret = err; + continue; + } + + /* we put the first parent into the ref at hand */ + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); + ref->parent = node ? node->val : 0; + ref->inode_list = + node ? (struct extent_inode_elem *)node->aux : 0; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, &uiter))) { + new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + break; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + new_ref->inode_list = + (struct extent_inode_elem *)node->aux; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } + + ulist_free(parents); + return ret; +} + +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + fs_info->tree_root->leafsize, 0); + BUG_ON(!eb); + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). + * mode = 2: merge identical parents + */ +static int __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + struct __prelim_ref *xchg; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (!ref_for_same_block(ref1, ref2)) + continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } + ref1->count += ref2->count; + } else { + if (ref1->parent != ref2->parent) + continue; + ref1->count += ref2->count; + } + list_del(&ref2->list); + kfree(ref2); + } + + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct list_head *prefs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; + int sgn; + int ret = 0; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); + + while ((n = rb_prev(n))) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + if (node->bytenr != head->node.bytenr) + break; + WARN_ON(node->is_head); + + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, &op_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, NULL, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return 0; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int *info_level, struct list_head *prefs) +{ + int ret = 0; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0]; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + *info_level + 1, offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int info_level, struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, NULL, + info_level + 1, key.offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 delayed_ref_seq, u64 time_seq, + struct ulist *refs, struct ulist *roots, + const u64 *extent_item_pos) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head; + int info_level = 0; + int ret; + int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->search_commit_root = !!search_commit_root; + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + head = NULL; + + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + /* + * look if there are updates for this ref queued and lock the + * head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, delayed_ref_seq, + &prefs_delayed); + mutex_unlock(&head->mutex); + if (ret) { + spin_unlock(&delayed_refs->lock); + goto out; + } + } + spin_unlock(&delayed_refs->lock); + } + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + path->slots[0]--; + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_level, &prefs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + list_splice_init(&prefs_delayed, &prefs); + + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 1); + if (ret) + goto out; + + ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, + &prefs, extent_item_pos); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 2); + if (ret) + goto out; + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + if (ref->count < 0) + WARN_ON(1); + if (ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + if (ref->count && ref->parent) { + struct extent_inode_elem *eie = NULL; + if (extent_item_pos && !ref->inode_list) { + u32 bsz; + struct extent_buffer *eb; + bsz = btrfs_level_size(fs_info->extent_root, + info_level); + eb = read_tree_block(fs_info->extent_root, + ref->parent, bsz, 0); + BUG_ON(!eb); + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + ref->inode_list = eie; + free_extent_buffer(eb); + } + ret = ulist_add_merge(refs, ref->parent, + (unsigned long)ref->inode_list, + (unsigned long *)&eie, GFP_NOFS); + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } + BUG_ON(ret < 0); + } + kfree(ref); + } + +out: + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kfree(ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kfree(ref); + } + + return ret; +} + +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct extent_inode_elem *eie_next; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)node->aux; + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } + node->aux = 0; + } + + ulist_free(blocks); +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 delayed_ref_seq, u64 time_seq, + struct ulist **leafs, + const u64 *extent_item_pos) +{ + struct ulist *tmp; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) { + ulist_free(tmp); + return -ENOMEM; + } + + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, *leafs, tmp, extent_item_pos); + ulist_free(tmp); + + if (ret < 0 && ret != -ENOENT) { + free_leaf_list(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + struct ulist_iterator uiter; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, tmp, *roots, NULL); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + } + + ulist_free(tmp); + return 0; +} + + static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_key *found_key) @@ -108,19 +1137,25 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = size - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; if (bytes_left >= 0) dest[bytes_left] = '\0'; + path->leave_spinning = 1; while (1) { len = btrfs_inode_ref_name_len(eb, iref); bytes_left -= len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, (unsigned long)(iref + 1), len); - if (eb != eb_in) + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + } ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; if (ret) break; next_inum = found_key.offset; @@ -132,8 +1167,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, slot = path->slots[0]; eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ - if (eb != eb_in) + if (eb != eb_in) { atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); @@ -144,6 +1182,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); + path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); @@ -181,8 +1220,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) + found_key->objectid + found_key->offset <= logical) { + pr_debug("logical %llu is not within any extent\n", + (unsigned long long)logical); return -ENOENT; + } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -191,6 +1233,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + (unsigned long long)logical, + (unsigned long long)(logical - found_key->objectid), + (unsigned long long)found_key->objectid, + (unsigned long long)found_key->offset, + (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -287,295 +1336,103 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int __data_list_add(struct list_head *head, u64 inum, - u64 extent_data_item_offset, u64 root) -{ - struct __data_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->inum = inum; - ref->extent_data_item_offset = extent_data_item_offset; - ref->root = root; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, - struct btrfs_extent_data_ref *dref) -{ - return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), - btrfs_extent_data_ref_offset(eb, dref), - btrfs_extent_data_ref_root(eb, dref)); -} - -static int __shared_list_add(struct list_head *head, u64 disk_byte) -{ - struct __shared_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->disk_byte = disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, - u64 logical, u64 inum, - u64 extent_data_item_offset, - u64 extent_offset, - struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) -{ - u64 ref_root; - u32 item_size; - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *eiref; - struct __data_ref *ref; - int ret; - int type; - int last; - unsigned long ptr = 0; - - WARN_ON(!list_empty(data_refs)); - ret = extent_from_logical(fs_info, logical, path, &key); - if (ret & BTRFS_EXTENT_FLAG_DATA) - ret = -EIO; - if (ret < 0) - goto out; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - ret = 0; - ref_root = 0; - /* - * as done in iterate_extent_inodes, we first build a list of refs to - * iterate, then free the path and then iterate them to avoid deadlocks. - */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last < 0) { - ret = last; - goto out; - } - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) { - ref_root = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __data_list_add(data_refs, inum, - extent_data_item_offset, - ref_root); - } - } while (!ret && !last); - - btrfs_release_path(path); - - if (ref_root == 0) { - printk(KERN_ERR "btrfs: failed to find tree block ref " - "for shared data backref %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - -out: - while (!list_empty(data_refs)) { - ref = list_first_entry(data_refs, struct __data_ref, list); - list_del(&ref->list); - if (!ret) - ret = iterate(ref->inum, extent_offset + - ref->extent_data_item_offset, - ref->root, ctx); - kfree(ref); - } - - return ret; -} - -static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, - u64 logical, u64 orig_extent_item_objectid, - u64 extent_offset, struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, + iterate_extent_inodes_t *iterate, void *ctx) { - u64 disk_byte; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *eb; - int slot; - int nritems; - int ret; - int found = 0; - - eb = read_tree_block(fs_info->tree_root, logical, - fs_info->tree_root->leafsize, 0); - if (!eb) - return -EIO; - - /* - * from the shared data ref, we only have the leaf but we need - * the key. thus, we must look into all items and see that we - * find one (some) with a reference to our extent item. - */ - nritems = btrfs_header_nritems(eb); - for (slot = 0; slot < nritems; ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - if (!fi) { - free_extent_buffer(eb); - return -EIO; - } - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) { - if (found) - break; - else - continue; - } - ++found; - ret = __iter_shared_inline_ref_inodes(fs_info, logical, - key.objectid, - key.offset, - extent_offset, path, - data_refs, - iterate, ctx); - if (ret) + struct extent_inode_elem *eie; + int ret = 0; + + for (eie = inode_list; eie; eie = eie->next) { + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); + if (ret) { + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); break; + } } - if (!found) { - printk(KERN_ERR "btrfs: failed to follow shared data backref " - "to parent %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - - free_extent_buffer(eb); return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. will use the path given as a parameter and return it - * released. + * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - u64 extent_item_objectid, - u64 extent_offset, + u64 extent_item_objectid, u64 extent_item_pos, + int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx) { - unsigned long ptr = 0; - int last; int ret; - int type; - u64 logical; - u32 item_size; - struct btrfs_extent_inline_ref *eiref; - struct btrfs_extent_data_ref *dref; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct __data_ref *ref_d; - struct __shared_ref *ref_s; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - /* first we iterate the inline refs, ... */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last == -ENOENT) { - ret = 0; - break; - } - if (last < 0) { - ret = last; - break; - } + struct btrfs_trans_handle *trans; + struct ulist *refs = NULL; + struct ulist *roots = NULL; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list seq_elem = {}; + struct seq_list tree_mod_seq_elem = {}; + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); + + if (search_commit_root) { + trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; + } else { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&eiref->offset); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - logical = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __shared_list_add(&shared_refs, logical); - } - } while (!ret && !last); + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + seq_elem.seq, tree_mod_seq_elem.seq, &refs, + &extent_item_pos); + if (ret) + goto out; - /* ... then we proceed to in-tree references and ... */ - while (!ret) { - ++path->slots[0]; - if (path->slots[0] > btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret) { - if (ret == 1) - ret = 0; /* we're done */ - break; - } - eb = path->nodes[0]; - } - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_item_objectid) + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, + seq_elem.seq, + tree_mod_seq_elem.seq, &roots); + if (ret) break; - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __shared_list_add(&shared_refs, key.offset); + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { + pr_debug("root %llu references leaf %llu, data list " + "%#lx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs( + (struct extent_inode_elem *)ref_node->aux, + root_node->val, extent_item_objectid, + iterate, ctx); } + ulist_free(roots); + roots = NULL; } - btrfs_release_path(path); - - /* - * ... only at the very end we can process the refs we found. this is - * because the iterator function we call is allowed to make tree lookups - * and we have to avoid deadlocks. additionally, we need more tree - * lookups ourselves for shared data refs. - */ - while (!list_empty(&data_refs)) { - ref_d = list_first_entry(&data_refs, struct __data_ref, list); - list_del(&ref_d->list); - if (!ret) - ret = iterate(ref_d->inum, extent_offset + - ref_d->extent_data_item_offset, - ref_d->root, ctx); - kfree(ref_d); - } - - while (!list_empty(&shared_refs)) { - ref_s = list_first_entry(&shared_refs, struct __shared_ref, - list); - list_del(&ref_s->list); - if (!ret) - ret = __iter_shared_inline_ref(fs_info, - ref_s->disk_byte, - extent_item_objectid, - extent_offset, path, - &data_refs, - iterate, ctx); - kfree(ref_s); + free_leaf_list(refs); + ulist_free(roots); +out: + if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); } return ret; @@ -586,19 +1443,22 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 offset; + u64 extent_item_pos; struct btrfs_key found_key; + int search_commit_root = path->search_commit_root; ret = extent_from_logical(fs_info, logical, path, &found_key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - offset = logical - found_key.objectid; - ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - offset, iterate, ctx); + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, search_commit_root, + iterate, ctx); return ret; } @@ -607,7 +1467,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_path *path, iterate_irefs_t *iterate, void *ctx) { - int ret; + int ret = 0; int slot; u32 cur; u32 len; @@ -619,7 +1479,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_inode_ref *iref; struct btrfs_key found_key; - while (1) { + while (!ret) { + path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -635,6 +1496,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(eb, slot); @@ -643,14 +1506,17 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, + (unsigned long long)found_key.objectid, + (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); - if (ret) { - free_extent_buffer(eb); + if (ret) break; - } len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -683,10 +1549,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { + pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { + pr_debug("missed path, not enough space. missing bytes: %lu, " + "constructed so far: %s\n", + (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; @@ -711,12 +1581,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) inode_to_path, ipath); } -/* - * allocates space to return multiple file system paths for an inode. - * total_bytes to allocate are passed, note that space usable for actual path - * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. - */ struct btrfs_data_container *init_data_container(u32 total_bytes) { struct btrfs_data_container *data; @@ -772,5 +1636,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + if (!ipath) + return; + kfree(ipath->fspath); kfree(ipath); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8..c18d8ac7b79 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -20,6 +20,9 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" +#include "ulist.h" + +#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -43,9 +46,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 extent_item_objectid, - u64 extent_offset, + u64 extent_offset, int search_commit_root, iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, @@ -54,6 +56,11 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots); + struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 634608d2a6d..12394a90d60 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -24,6 +24,21 @@ #include "ordered-data.h" #include "delayed-inode.h" +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 + /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ @@ -51,12 +66,12 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. @@ -75,14 +90,13 @@ struct btrfs_inode { /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; + unsigned long runtime_flags; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ u64 generation; - /* sequence number for NFS changes */ - u64 sequence; - /* * transid of the trans_handle that last modified this inode */ @@ -142,22 +156,9 @@ struct btrfs_inode { unsigned reserved_extents; /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - */ - unsigned ordered_data_close:1; - unsigned orphan_meta_reserved:1; - unsigned dummy_inode:1; - unsigned in_defrag:1; - unsigned delalloc_meta_reserved:1; - - /* * always compress this one file */ - unsigned force_compress:4; + unsigned force_compress; struct btrfs_delayed_node *delayed_node; @@ -199,4 +200,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, return false; } +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..da6e9364a5e --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3358 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and a FLUSH request to the device where + * these blocks are located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/crc32c.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" +#include "rcu-string.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num:2; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char **datav; + struct page **pagev; + void *mem_to_free; +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret = 0; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)PAGE_CACHE_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + (unsigned long long) + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, + sizeof(super_tmp->magic)) || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_leafsize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + + BUG_ON(!first_hdr); + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(leafhdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + sf->nr, + (unsigned long long) + le64_to_cpu(leafhdr->header.generation), + (unsigned long long) + le64_to_cpu(leafhdr->header.owner)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; + u8 type; + u32 item_offset; + + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = le32_to_cpu(disk_item.offset); + disk_key = &disk_item.key; + type = disk_key->type; + + if (BTRFS_ROOT_ITEM_KEY == type) { + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + + sizeof(struct btrfs_root_item) > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + sizeof(struct btrfs_root_item)); + next_bytenr = le64_to_cpu(root_item.bytenr); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + le64_to_cpu(root_item. + generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(nodehdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + (unsigned long long) + le64_to_cpu(nodehdr->header.generation), + (unsigned long long) + le64_to_cpu(nodehdr->header.owner)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = le64_to_cpu(key_ptr.blockptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &key_ptr.key, + le64_to_cpu(key_ptr.generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.datav[0]; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) + goto one_stack_frame_backwards; + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, state->metablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + (unsigned long long)next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block), + (unsigned long long)next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)next_block_ctx->len) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + (unsigned long long)next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; + struct btrfsic_block_link *l; + + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + (unsigned long long) + le64_to_cpu(file_extent_item.disk_bytenr)); + return 0; + } + + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + + le64_to_cpu(file_extent_item.offset); + generation = le64_to_cpu(file_extent_item.generation); + num_bytes = le64_to_cpu(file_extent_item.num_bytes); + generation = le64_to_cpu(file_extent_item.generation); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item.type, + (unsigned long long) + le64_to_cpu(file_extent_item.disk_bytenr), + (unsigned long long)le64_to_cpu(file_extent_item.offset), + (unsigned long long)num_bytes); + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, state->datablock_size); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + (unsigned long long)next_bytenr, + chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx.dev->name, + (unsigned long long) + next_block_ctx.dev_bytenr, + mirror_num, + (unsigned long long) + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + bytenr, &length, &multi, mirror_num); + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + + if (0 == ret) + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + (unsigned long long)block_ctx->dev_bytenr); + return -1; + } + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) + return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; + } + + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_sector = dev_bytenr >> 9; + bio->bi_end_io = btrfsic_complete_bio_end_io; + bio->bi_private = &complete; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + submit_bio(READ, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } + + return block_ctx->len; +} + +static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + char **datav, unsigned int num_pages) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + unsigned int i; + + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + return 1; + + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + + crc = crc32c(crc, data, sublen); + } + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + return 1; + + return 0; /* is metadata */ +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; + + if (NULL != bio_is_patched) + *bio_is_patched = 0; + +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr = 0; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_super_block *) + mapped_datav[0])->bytenr); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_datav[0])->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr); + } + if (block->logical_bytenr != bytenr) { + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + (unsigned long long) + block->logical_bytenr); + block->logical_bytenr = bytenr; + } else if (state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(block->disk_key.objectid), + block->disk_key.type, + (unsigned long long) + le64_to_cpu(block->disk_key.offset), + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_datav[0])->generation), + (unsigned long long) + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_datav[0])->generation)); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + goto continue_loop; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, + processed_len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", (unsigned long long)bytenr); + goto continue_loop; + } + block_ctx.datav = mapped_datav; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_datav[0]); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + (unsigned long long)dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + processed_len = state->datablock_size; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, + (unsigned long long)dev_bytenr); + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.mem_to_free = NULL; + block_ctx.pagev = NULL; + } else { + processed_len = state->metablock_size; + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_datav[0])->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + goto continue_loop; + } + } + block_ctx.datav = mapped_datav; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + goto continue_loop; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long) + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long)dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, BTRFS_SUPER_INFO_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { + WARN_ON(1); + btrfsic_dump_tree(state); + } + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)l->block_ref_to->generation, + (unsigned long long)l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)block->flush_gen, + (unsigned long long) + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + bytenr, state->metablock_size); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, state->metablock_size, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + (unsigned long long)bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (!match) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + (unsigned long long)bytenr, dev_state->name, + (unsigned long long)dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + (unsigned long long)bytenr, + block_ctx.dev->name, + (unsigned long long)block_ctx.dev_bytenr, + mirror_num); + } + WARN_ON(1); + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," + " size=%lu, data=%p, bdev=%p)\n", + rw, (unsigned long)bh->b_blocknr, + (unsigned long long)dev_bytenr, + (unsigned long)bh->b_size, bh->b_data, + bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + &bh->b_data, 1, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) { + submit_bio(rw, bio); + return; + } + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + int bio_is_patched; + char **mapped_datav; + + dev_bytenr = 512 * bio->bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, + (unsigned long long)dev_bytenr, + bio->bi_bdev); + + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, + GFP_NOFS); + if (!mapped_datav) + goto leave; + for (i = 0; i < bio->bi_vcnt; i++) { + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } + if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE) == + (dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "#%u: page=%p, len=%u, offset=%u\n", + i, bio->bi_io_vec[i].bv_page, + bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } +leave: + mutex_unlock(&btrfsic_mutex); + + submit_bio(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (root->nodesize != root->leafsize) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", + root->nodesize, root->leafsize); + return -1; + } + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone || b_all->never_written) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..8b59175cc50 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d..86eff48dab7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode, page = cb->compressed_pages[i]; csum = ~(u32)0; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (csum != *cb_sum) { printk(KERN_INFO "btrfs csum failed ino %llu " @@ -226,8 +226,8 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline int end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; @@ -253,7 +253,6 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start, index += ret; } /* the inode may be gone now */ - return 0; } /* @@ -392,20 +391,21 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, */ atomic_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -421,15 +421,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_get(bio); ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(bio); return 0; @@ -497,7 +497,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * sure they map to this compressed extent on disk. */ set_page_extent_mapped(page); - lock_extent(tree, last_offset, end, GFP_NOFS); + lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); @@ -507,7 +507,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -521,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (zero_offset) { int zeros; zeros = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page, KM_USER0); + userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); - kunmap_atomic(userpage, KM_USER0); + kunmap_atomic(userpage); } } @@ -535,7 +535,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, nr_pages++; page_cache_release(page); } else { - unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_extent(tree, last_offset, end); unlock_page(page); page_cache_release(page); break; @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); @@ -660,7 +662,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ /* * inc the count before we submit the bio so @@ -673,19 +675,20 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + BUG_ON(!comp_bio); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -696,15 +699,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); + BUG_ON(ret); /* -ENOMEM */ bio_put(comp_bio); return 0; @@ -732,7 +735,7 @@ struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_lzo_compress, }; -int __init btrfs_init_compress(void) +void __init btrfs_init_compress(void) { int i; @@ -742,7 +745,6 @@ int __init btrfs_init_compress(void) atomic_set(&comp_alloc_workspace[i], 0); init_waitqueue_head(&comp_workspace_wait[i]); } - return 0; } /* @@ -991,9 +993,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(PAGE_CACHE_SIZE - *pg_offset, PAGE_CACHE_SIZE - buf_offset); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); + kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); flush_dcache_page(page_out); *pg_offset += bytes; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a12059f4f0f..9afb0a62ae8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_init_compress(void); +void btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(int type, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdee..8206b390058 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -36,8 +37,17 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot, + int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid, + u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 time_seq); struct btrfs_path *btrfs_alloc_path(void) { @@ -156,10 +166,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -207,10 +230,12 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + spin_lock(&root->fs_info->trans_lock); if (root->track_dirty && list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } + spin_unlock(&root->fs_info->trans_lock); } /* @@ -261,9 +286,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -273,6 +298,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + struct seq_list elem; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + elem->flags = 1; + spin_lock(&fs_info->tree_mod_seq_lock); + __get_tree_mod_seq(fs_info, elem); + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + BUG_ON(!(elem->flags & 1)); + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + goto out; + } + min_seq = cur_elem->seq; + } + } + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->elem.seq > min_seq) + continue; + rb_erase(node, tm_root); + list_del(&tm->elem.list); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +out: + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + int ret = 0; + + BUG_ON(!tm || !tm->elem.seq); + + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->elem.seq < tm->elem.seq) + new = &((*new)->rb_left); + else if (cur->elem.seq > tm->elem.seq) + new = &((*new)->rb_right); + else { + kfree(tm); + ret = -EEXIST; + goto unlock; + } + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); +unlock: + write_unlock(&fs_info->tree_mod_log_lock); + return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (!eb) + return 0; + if (btrfs_header_level(eb) == 0) + return 1; + return 0; +} + +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. + */ +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, + struct tree_mod_elem **tm_ret) +{ + struct tree_mod_elem *tm; + int seq; + + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + + tm = *tm_ret = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; + + tm->elem.flags = 0; + spin_lock(&fs_info->tree_mod_seq_lock); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list, because no blocker exists. items + * are removed from the list only when the existing blocker is + * removed from the list. + */ + kfree(tm); + seq = 0; + spin_unlock(&fs_info->tree_mod_seq_lock); + } else { + __get_tree_mod_seq(fs_info, &tm->elem); + seq = tm->elem.seq; + } + + return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + int slot, enum mod_log_op op) +{ + return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + int i; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING); + BUG_ON(ret < 0); + } + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->elem.seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->elem.seq > cur->elem.seq); + found = cur; + node = node->rb_left; + } else if (cur->elem.seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->elem.seq < cur->elem.seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret; + int i; + + if (tree_mod_dont_log(fs_info, NULL)) + return; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return; + + /* speed this up by single seq for all operations? */ + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + int i; + int ret; + u32 nritems; + + if (tree_mod_dont_log(fs_info, eb)) + return; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node) +{ + int ret; + tree_mod_log_free_eb(root->fs_info, root->node); + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS); + BUG_ON(ret < 0); +} + /* * check if the tree block can be shared by multiple trees */ @@ -331,8 +799,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, buf->len, &refs, &flags); - BUG_ON(ret); - BUG_ON(refs == 0); + if (ret) + return ret; + if (refs == 0) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + return ret; + } } else { refs = 1; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || @@ -350,44 +823,51 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); + BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_inc_ref(trans, root, cow, 1, 1); + BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); + BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, new_flags, 0); - BUG_ON(ret); + if (ret) + return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); + BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_dec_ref(trans, root, buf, 1, 1); + BUG_ON(ret); /* -ENOMEM */ } + /* + * don't log freeing in case we're freeing the root node, this + * is done by tree_mod_log_set_root_pointer later + */ + if (buf != root->node && btrfs_header_level(buf) != 0) + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -415,7 +895,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *cow; - int level; + int level, ret; int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -467,7 +947,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow, &last_ref); + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } if (root->ref_cows) btrfs_reloc_cow_block(trans, root, buf, cow); @@ -481,6 +965,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); + tree_mod_log_set_root_pointer(root, cow); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -494,6 +979,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -504,12 +991,235 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = root->node->start; + int looped = 0; + + if (!time_seq) + return 0; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return 0; + /* + * if there are no tree operation for the oldest root, we simply + * return it. this should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * if there's an operation that's not a root replacement, we + * found the oldest version of our root. normally, we'll find a + * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + BUG_ON(root_logical == root->node->start); + looped = 1; + } + + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + while (tm && tm->elem.seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(eb->start, + fs_info->tree_root->nodesize); + BUG_ON(!eb_rewin); + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + BUG_ON(!eb_rewin); + } + + extent_buffer_get(eb_rewin); + free_extent_buffer(eb); + + __tree_mod_log_rewind(eb_rewin, time_seq, tm); + + return eb_rewin; +} + +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). + */ +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + + eb = btrfs_read_lock_root_node(root); + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (!tm) + return root->node; + + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = root->node->start; + } + + tm = tree_mod_log_search(root->fs_info, logical, time_seq); + if (old_root) + eb = alloc_dummy_extent_buffer(logical, root->nodesize); + else + eb = btrfs_clone_extent_buffer(root->node); + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + if (!eb) + return NULL; + btrfs_tree_read_lock(eb); + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + if (tm) + __tree_mod_log_rewind(eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + extent_buffer_get(eb); + + return eb; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -700,7 +1410,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen); + uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { @@ -714,7 +1424,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) return -EIO; } else if (!uptodate) { - btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } } } if (search_start == 0) @@ -829,20 +1543,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb, static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot) { - if (level == 0) { + if (level == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), key, btrfs_header_nritems(eb), slot); - } else { + else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), key, btrfs_header_nritems(eb), slot); - } - return -1; } int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, @@ -934,7 +1646,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - BUG_ON(!child); + if (!child) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } + btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); @@ -944,6 +1661,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -959,15 +1677,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - btrfs_header_nritems(mid); - left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); @@ -997,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - btrfs_header_nritems(mid); } /* @@ -1010,17 +1725,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - wret = del_ptr(trans, root, path, level + 1, pslot + - 1); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot + 1, 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &right_key, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1035,7 +1749,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - BUG_ON(!left); + if (!left) { + ret = -EROFS; + btrfs_std_error(root->fs_info, ret); + goto enospc; + } wret = balance_node_right(trans, root, mid, left); if (wret < 0) { ret = wret; @@ -1051,17 +1769,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - wret = del_ptr(trans, root, path, level + 1, pslot); - if (wret) - ret = wret; + del_ptr(trans, root, path, level + 1, pslot, 1); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1159,6 +1877,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1210,6 +1930,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1331,7 +2053,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); eb = btrfs_find_tree_block(root, block1, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + /* + * if we get -eagain from btrfs_buffer_uptodate, we + * don't want to return eagain here. That will loop + * forever + */ + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block1 = 0; free_extent_buffer(eb); } @@ -1339,7 +2066,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen)) + if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } @@ -1382,7 +2109,8 @@ static noinline int reada_for_balance(struct btrfs_root *root, * if lowest_unlock is 1, level 0 won't be unlocked */ static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock) + int lowest_unlock, int min_write_lock_level, + int *write_lock_level) { int i; int skip_level = level; @@ -1414,6 +2142,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level, if (i >= lowest_unlock && i > skip_level && path->locks[i]) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; + if (write_lock_level && + i > min_write_lock_level && + i <= *write_lock_level) { + *write_lock_level = i - 1; + } } } } @@ -1456,7 +2189,7 @@ static int read_block_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) + struct btrfs_key *key, u64 time_seq) { u64 blocknr; u64 gen; @@ -1471,8 +2204,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { - if (btrfs_buffer_uptodate(tmp, 0)) { - if (btrfs_buffer_uptodate(tmp, gen)) { + /* first we do an atomic uptodate check */ + if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* * we found an up to date block without * sleeping, return @@ -1490,8 +2224,9 @@ read_block_for_search(struct btrfs_trans_handle *trans, free_extent_buffer(tmp); btrfs_set_path_blocking(p); + /* now we're allowed to do a blocking uptodate check */ tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { *eb_ret = tmp; return 0; } @@ -1526,7 +2261,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!btrfs_buffer_uptodate(tmp, 0)) + if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); } @@ -1637,6 +2372,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; + int min_write_lock_level; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1664,6 +2400,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; + min_write_lock_level = write_lock_level; + again: /* * we try very hard to do read locks on the root @@ -1795,7 +2533,8 @@ cow_done: goto again; } - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); if (level == lowest_level) { if (dec) @@ -1804,7 +2543,7 @@ cow_done: } err = read_block_for_search(trans, root, p, - &b, level, slot, key); + &b, level, slot, key, 0); if (err == -EAGAIN) goto again; if (err) { @@ -1857,7 +2596,8 @@ cow_done: } } if (!p->search_for_split) - unlock_up(p, level, lowest_unlock); + unlock_up(p, level, lowest_unlock, + min_write_lock_level, &write_lock_level); goto done; } } @@ -1875,21 +2615,125 @@ done: } /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = get_old_root(root, time_seq); + level = btrfs_header_level(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + b = tree_mod_log_rewind(root->fs_info, b, time_seq); + if (b != p->nodes[level]) { + btrfs_tree_unlock_rw(p->nodes[level], + p->locks[level]); + p->locks[level] = 0; + p->nodes[level] = b; + } + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops * fixing up pointers when a given leaf/node is not in slot 0 of the * higher levels * - * If this fails to write a tree block, it returns -1, but continues - * fixing up the blocks in ram so the tree is consistent. */ -static int fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) { int i; - int ret = 0; struct extent_buffer *t; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1897,12 +2741,12 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; + tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) break; } - return ret; } /* @@ -1911,9 +2755,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; struct extent_buffer *eb; @@ -1923,13 +2767,11 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (comp_keys(&disk_key, new_key) >= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) >= 0); } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (comp_keys(&disk_key, new_key) <= 0) - return -1; + BUG_ON(comp_keys(&disk_key, new_key) <= 0); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -1937,7 +2779,6 @@ int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(eb); if (slot == 0) fixup_low_keys(trans, root, path, &disk_key, 1); - return 0; } /* @@ -1983,12 +2824,16 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { + tree_mod_log_eb_move(root->fs_info, src, 0, push_items, + src_nritems - push_items); memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -2042,11 +2887,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); + tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2121,6 +2969,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; + tree_mod_log_set_root_pointer(root, c); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2140,36 +2989,42 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. - * - * returns zero on success and < 0 on any error */ -static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, struct btrfs_disk_key - *key, u64 bytenr, int slot, int level) +static void insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; + int ret; BUG_ON(!path->nodes[level]); btrfs_assert_tree_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) - BUG(); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { + if (level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } + if (level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); - return 0; } /* @@ -2190,7 +3045,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; int mid; int ret; - int wret; u32 c_nritems; c = path->nodes[level]; @@ -2235,7 +3089,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -2247,11 +3101,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - wret = insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, - level + 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2320,6 +3171,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; + struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -2331,6 +3183,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 data_end; u32 this_item_size; + btrfs_init_map_token(&token); + if (empty) nr = 0; else @@ -2408,8 +3262,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space -= btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space -= btrfs_token_item_size(right, item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } left_nritems -= push_items; @@ -2537,9 +3391,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 old_left_nritems; u32 nr; int ret = 0; - int wret; u32 this_item_size; u32 old_left_item_size; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -2600,9 +3456,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, item = btrfs_item_nr(left, i); - ioff = btrfs_item_offset(left, item); - btrfs_set_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + ioff = btrfs_token_item_offset(left, item, &token); + btrfs_set_token_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -2632,8 +3489,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - push_space = push_space - btrfs_item_size(right, item); - btrfs_set_item_offset(right, item, push_space); + push_space = push_space - btrfs_token_item_size(right, + item, &token); + btrfs_set_token_item_offset(right, item, push_space, &token); } btrfs_mark_buffer_dirty(left); @@ -2643,9 +3501,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -2716,7 +3572,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root path->nodes[1], slot - 1, &left); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ - ret = 1; + if (ret == -ENOSPC) + ret = 1; goto out; } @@ -2738,22 +3595,21 @@ out: /* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. */ -static noinline int copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline void copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { int data_copy_size; int rt_data_off; int i; - int ret = 0; - int wret; struct btrfs_disk_key disk_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -2775,17 +3631,15 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_item_offset(right, item); - btrfs_set_item_offset(right, item, ioff + rt_data_off); + ioff = btrfs_token_item_offset(right, item, &token); + btrfs_set_token_item_offset(right, item, + ioff + rt_data_off, &token); } btrfs_set_header_nritems(l, mid); - ret = 0; btrfs_item_key(right, &disk_key, 0); - wret = insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -2803,8 +3657,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); - - return ret; } /* @@ -2993,12 +3845,8 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; - + insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3006,29 +3854,21 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, + insert_ptr(trans, root, path, &disk_key, right->start, path->slots[1], 1); - if (wret) - ret = wret; btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } + if (path->slots[1] == 0) + fixup_low_keys(trans, root, path, + &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; } - ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); - BUG_ON(ret); + copy_for_split(trans, root, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -3036,7 +3876,7 @@ again: goto again; } - return ret; + return 0; push_for_double: push_for_double_split(trans, root, path, data_size); @@ -3238,11 +4078,9 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - ret = setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - BUG_ON(ret); - + setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -3257,10 +4095,10 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -int btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -3271,13 +4109,16 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; old_size = btrfs_item_size_nr(leaf, slot); if (old_size == new_size) - return 0; + return; nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(root, leaf); @@ -3297,8 +4138,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + size_diff); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + size_diff, &token); } /* shift the data */ @@ -3350,15 +4192,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* * make the item pointed to by the path bigger, data_size is the new size. */ -int btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) { int slot; struct extent_buffer *leaf; @@ -3368,6 +4209,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -3397,8 +4241,9 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - data_size); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - data_size, &token); } /* shift the data */ @@ -3416,7 +4261,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return 0; } /* @@ -3441,6 +4285,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, unsigned int data_end; struct btrfs_disk_key disk_key; struct btrfs_key found_key; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); for (i = 0; i < nr; i++) { if (total_size + data_size[i] + sizeof(struct btrfs_item) > @@ -3506,8 +4353,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3534,9 +4382,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); @@ -3544,7 +4393,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -3562,19 +4411,21 @@ out: * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -int setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr) { struct btrfs_item *item; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; - int ret; struct extent_buffer *leaf; int slot; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; @@ -3606,8 +4457,9 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff - total_data); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff - total_data, &token); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -3626,17 +4478,17 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(leaf, slot + i); - btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + btrfs_set_token_item_offset(leaf, item, + data_end - data_size[i], &token); data_end -= data_size[i]; - btrfs_set_item_size(leaf, item, data_size[i]); + btrfs_set_token_item_size(leaf, item, data_size[i], &token); } btrfs_set_header_nritems(leaf, nritems + nr); - ret = 0; if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - ret = fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(trans, root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -3645,7 +4497,6 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_print_leaf(root, leaf); BUG(); } - return ret; } /* @@ -3672,16 +4523,14 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret == 0) return -EEXIST; if (ret < 0) - goto out; + return ret; slot = path->slots[0]; BUG_ON(slot < 0); - ret = setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(trans, root, path, cpu_key, data_size, total_data, total_size, nr); - -out: - return ret; + return 0; } /* @@ -3717,22 +4566,30 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot, + int tree_mod_log) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; - int ret = 0; - int wret; + int ret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); } + nritems--; btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { @@ -3743,12 +4600,9 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); - return ret; } /* @@ -3761,17 +4615,13 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { - int ret; - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - ret = del_ptr(trans, root, path, 1, path->slots[1]); - if (ret) - return ret; + del_ptr(trans, root, path, 1, path->slots[1], 1); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3781,8 +4631,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1); - return 0; + free_extent_buffer_stale(leaf); } /* * delete the item at the leaf level in path. If that empties @@ -3799,6 +4650,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; + struct btrfs_map_token token; + + btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -3820,8 +4674,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - ioff = btrfs_item_offset(leaf, item); - btrfs_set_item_offset(leaf, item, ioff + dsize); + ioff = btrfs_token_item_offset(leaf, item, &token); + btrfs_set_token_item_offset(leaf, item, + ioff + dsize, &token); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -3839,8 +4694,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { btrfs_set_path_blocking(path); clean_tree_block(trans, root, leaf); - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -3848,10 +4702,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - wret = fixup_low_keys(trans, root, path, - &disk_key, 1); - if (wret) - ret = wret; + fixup_low_keys(trans, root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -3879,9 +4730,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, leaf); - BUG_ON(ret); + btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); + ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4029,7 +4880,7 @@ again: tmp = btrfs_find_tree_block(root, blockptr, btrfs_level_size(root, level - 1)); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { free_extent_buffer(tmp); break; } @@ -4059,18 +4910,18 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1); + unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); + BUG_ON(!cur); /* |