From 921a1650de9eed40dd64d681aba4a4d98856f289 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 01:15:31 +0400 Subject: new helper: done_path_create() releases what needs to be released after {kern,user}_path_create() Signed-off-by: Al Viro --- fs/namei.c | 24 ++++++++++++------------ fs/ocfs2/refcounttree.c | 4 +--- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 2ccc35c4dc2..5bc6f3d1dc8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2893,6 +2893,14 @@ out: } EXPORT_SYMBOL(kern_path_create); +void done_path_create(struct path *path, struct dentry *dentry) +{ + dput(dentry); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); +} +EXPORT_SYMBOL(done_path_create); + struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) { char *tmp = getname(pathname); @@ -2989,9 +2997,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return error; } @@ -3048,9 +3054,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return error; } @@ -3334,9 +3338,7 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); out_putname: putname(from); return error; @@ -3446,9 +3448,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, out_drop_write: mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f32d7cbb7a..23cf78f6850 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4477,9 +4477,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, new_dentry, preserve); mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); -- cgit v1.2.3 From 8e4bfca1d1f0de62301dd223675717e7a5f63a27 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 01:17:26 +0400 Subject: mknod: take sanity checks on mode into the very beginning Note that applying umask can't affect their results. While that affects errno in cases like mknod("/no_such_directory/a", 030000) yielding -EINVAL (due to impossible mode_t) instead of -ENOENT (due to inexistent directory), IMO that makes a lot more sense, POSIX allows to return either and any software that relies on getting -ENOENT instead of -EINVAL in that case deserves everything it gets. Signed-off-by: Al Viro --- fs/namei.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5bc6f3d1dc8..cf362dc9d1f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2964,8 +2964,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, struct path path; int error; - if (S_ISDIR(mode)) - return -EPERM; + error = may_mknod(mode); + if (error) + return error; dentry = user_path_create(dfd, filename, &path, 0); if (IS_ERR(dentry)) @@ -2973,9 +2974,6 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = may_mknod(mode); - if (error) - goto out_dput; error = mnt_want_write(path.mnt); if (error) goto out_dput; -- cgit v1.2.3 From a8104a9fcdeb82e22d7acd55fca20746581067d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 02:25:00 +0400 Subject: pull mnt_want_write()/mnt_drop_write() into kern_path_create()/done_path_create() resp. One side effect - attempt to create a cross-device link on a read-only fs fails with EROFS instead of EXDEV now. Makes more sense, POSIX allows, etc. Signed-off-by: Al Viro --- fs/namei.c | 57 ++++++++++++++++--------------------------------- fs/ocfs2/refcounttree.c | 7 ------ 2 files changed, 18 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index cf362dc9d1f..a3fb78fd70d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2865,10 +2865,11 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) - goto fail; + goto unlock; + error = -EEXIST; if (dentry->d_inode) - goto eexist; + goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -2876,16 +2877,18 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!is_dir && nd.last.name[nd.last.len])) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); + error = -ENOENT; goto fail; } + error = mnt_want_write(nd.path.mnt); + if (error) + goto fail; *path = nd.path; return dentry; -eexist: - dput(dentry); - dentry = ERR_PTR(-EEXIST); fail: + dput(dentry); + dentry = ERR_PTR(error); +unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); out: path_put(&nd.path); @@ -2897,6 +2900,7 @@ void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); mutex_unlock(&path->dentry->d_inode->i_mutex); + mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); @@ -2974,12 +2978,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mknod(&path, dentry, mode, dev); if (error) - goto out_drop_write; + goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); @@ -2992,11 +2993,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: +out: done_path_create(&path, dentry); - return error; } @@ -3042,16 +3040,9 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mkdir(&path, dentry, mode); - if (error) - goto out_drop_write; - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); return error; } @@ -3326,16 +3317,9 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, if (IS_ERR(dentry)) goto out_putname; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_symlink(&path, dentry, from); - if (error) - goto out_drop_write; - error = vfs_symlink(path.dentry->d_inode, dentry, from); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: + if (!error) + error = vfs_symlink(path.dentry->d_inode, dentry, from); done_path_create(&path, dentry); out_putname: putname(from); @@ -3436,15 +3420,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = mnt_want_write(new_path.mnt); - if (error) - goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) - goto out_drop_write; + goto out_dput; error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); -out_drop_write: - mnt_drop_write(new_path.mnt); out_dput: done_path_create(&new_path, new_dentry); out: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 23cf78f6850..30a055049e1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,16 +4466,9 @@ int ocfs2_reflink_ioctl(struct inode *inode, goto out_dput; } - error = mnt_want_write(new_path.mnt); - if (error) { - mlog_errno(error); - goto out_dput; - } - error = ocfs2_vfs_reflink(old_path.dentry, new_path.dentry->d_inode, new_dentry, preserve); - mnt_drop_write(new_path.mnt); out_dput: done_path_create(&new_path, new_dentry); out: -- cgit v1.2.3 From bc65a1215eda3e067801e0a8f3eeffb62800f355 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 12:03:41 +0400 Subject: sanitize ecryptfs_lookup() * ->lookup() never gets hit with . or .. * dentry it gets is unhashed, so unless we had gone and hashed it ourselves, there's no need to d_drop() the sucker. * wrong name printed in one of the printks (NULL, in fact) Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ffa2be57804..eeb734aea5b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -381,12 +381,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; - if ((ecryptfs_dentry->d_name.len == 1 - && !strcmp(ecryptfs_dentry->d_name.name, ".")) - || (ecryptfs_dentry->d_name.len == 2 - && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { - goto out_d_drop; - } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, @@ -397,8 +391,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); - goto out_d_drop; + ecryptfs_dentry->d_name.name); + goto out; } if (lower_dentry->d_inode) goto interpose; @@ -415,7 +409,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); - goto out_d_drop; + goto out; } mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, @@ -427,14 +421,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); - goto out_d_drop; + goto out; } interpose: rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, ecryptfs_dir_inode); - goto out; -out_d_drop: - d_drop(ecryptfs_dentry); out: kfree(encrypted_and_encoded_name); return ERR_PTR(rc); -- cgit v1.2.3 From 0b1d90119a479ca3b70d871da4b2ce6c4ef9eff0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 12:09:19 +0400 Subject: ecryptfs_lookup_interpose(): allocate dentry_info first less work on failure that way Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index eeb734aea5b..c3ca12c33ca 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -318,21 +318,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct vfsmount *lower_mnt; int rc = 0; - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); - BUG_ON(!lower_dentry->d_count); - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - ecryptfs_set_dentry_private(dentry, dentry_info); if (!dentry_info) { printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); dput(lower_dentry); - mntput(lower_mnt); - d_drop(dentry); return -ENOMEM; } + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + BUG_ON(!lower_dentry->d_count); + + ecryptfs_set_dentry_private(dentry, dentry_info); ecryptfs_set_dentry_lower(dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); -- cgit v1.2.3 From 5c33b183a36500a5b0a3c53c11c431f0fec6efc8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 23:05:59 +0400 Subject: uninline file_free_rcu() What inline? Its only use is passing its address to call_rcu(), for fuck sake! Signed-off-by: Al Viro --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index b3fc4d67a26..b54bf7fd0b1 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); -- cgit v1.2.3 From b5bcdda32736b94a7d178d156d80a69f536ad468 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 23:28:46 +0400 Subject: take grabbing f->f_path to do_dentry_open() Signed-off-by: Al Viro --- fs/open.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 1e914b397e1..8d2c8970029 100644 --- a/fs/open.c +++ b/fs/open.c @@ -654,6 +654,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) f->f_mode = FMODE_PATH; + path_get(&f->f_path); inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, f->f_path.mnt); @@ -739,9 +740,7 @@ int finish_open(struct file *file, struct dentry *dentry, int error; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - mntget(file->f_path.mnt); - file->f_path.dentry = dget(dentry); - + file->f_path.dentry = dentry; error = do_dentry_open(file, open, current_cred()); if (!error) *opened |= FILE_OPENED; @@ -784,7 +783,6 @@ struct file *dentry_open(const struct path *path, int flags, f->f_flags = flags; f->f_path = *path; - path_get(&f->f_path); error = do_dentry_open(f, NULL, cred); if (!error) { error = open_check_o_direct(f); -- cgit v1.2.3 From e4fad8e5d220e3dfb1050eee752ee5058f29a232 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Jul 2012 15:33:25 +0400 Subject: consolidate pipe file creation Signed-off-by: Al Viro --- fs/exec.c | 19 +++++----------- fs/pipe.c | 75 ++++++++++++++++++++++----------------------------------------- 2 files changed, 32 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index da27b91ff1e..b800fb87f6c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file) */ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { - struct file *rp, *wp; + struct file *files[2]; struct fdtable *fdt; struct coredump_params *cp = (struct coredump_params *)info->data; struct files_struct *cf = current->files; + int err = create_pipe_files(files, 0); + if (err) + return err; - wp = create_write_pipe(0); - if (IS_ERR(wp)) - return PTR_ERR(wp); - - rp = create_read_pipe(wp, 0); - if (IS_ERR(rp)) { - free_write_pipe(wp); - return PTR_ERR(rp); - } - - cp->file = wp; + cp->file = files[1]; sys_close(0); - fd_install(0, rp); + fd_install(0, files[0]); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); __set_open_fd(0, fdt); diff --git a/fs/pipe.c b/fs/pipe.c index 49c1065256f..7523d9d2a99 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,18 +1016,16 @@ fail_inode: return NULL; } -struct file *create_write_pipe(int flags) +int create_pipe_files(struct file **res, int flags) { int err; - struct inode *inode; + struct inode *inode = get_pipe_inode(); struct file *f; struct path path; - struct qstr name = { .name = "" }; + static struct qstr name = { .name = "" }; - err = -ENFILE; - inode = get_pipe_inode(); if (!inode) - goto err; + return -ENFILE; err = -ENOMEM; path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); @@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags) f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; - f->f_mapping = inode->i_mapping; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); - f->f_version = 0; - return f; + res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + if (!res[0]) + goto err_file; + + path_get(&path); + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); + res[1] = f; + return 0; - err_dentry: +err_file: + put_filp(f); +err_dentry: free_pipe_info(inode); path_put(&path); - return ERR_PTR(err); + return err; - err_inode: +err_inode: free_pipe_info(inode); iput(inode); - err: - return ERR_PTR(err); -} - -void free_write_pipe(struct file *f) -{ - free_pipe_info(f->f_dentry->d_inode); - path_put(&f->f_path); - put_filp(f); -} - -struct file *create_read_pipe(struct file *wrf, int flags) -{ - /* Grab pipe from the writer */ - struct file *f = alloc_file(&wrf->f_path, FMODE_READ, - &read_pipefifo_fops); - if (!f) - return ERR_PTR(-ENFILE); - - path_get(&wrf->f_path); - f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - - return f; + return err; } int do_pipe_flags(int *fd, int flags) { - struct file *fw, *fr; + struct file *files[2]; int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; - fw = create_write_pipe(flags); - if (IS_ERR(fw)) - return PTR_ERR(fw); - fr = create_read_pipe(fw, flags); - error = PTR_ERR(fr); - if (IS_ERR(fr)) - goto err_write_pipe; + error = create_pipe_files(files, flags); + if (error) + return error; error = get_unused_fd_flags(flags); if (error < 0) @@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags) fdw = error; audit_fd_pair(fdr, fdw); - fd_install(fdr, fr); - fd_install(fdw, fw); + fd_install(fdr, files[0]); + fd_install(fdw, files[1]); fd[0] = fdr; fd[1] = fdw; @@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags) err_fdr: put_unused_fd(fdr); err_read_pipe: - path_put(&fr->f_path); - put_filp(fr); - err_write_pipe: - free_write_pipe(fw); + fput(files[0]); + fput(files[1]); return error; } -- cgit v1.2.3 From 3134f37e931d75931bdf6d4eacd82a3fd26eca7c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 25 Jul 2012 10:19:47 -0400 Subject: vfs: don't let do_last pass negative dentry to audit_inode I can reliably reproduce the following panic by simply setting an audit rule on a recent 3.5.0+ kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 IP: [] audit_copy_inode+0x10/0x90 PGD 7acd9067 PUD 7b8fb067 PMD 0 Oops: 0000 [#86] SMP Modules linked in: nfs nfs_acl auth_rpcgss fscache lockd sunrpc tpm_bios btrfs zlib_deflate libcrc32c kvm_amd kvm joydev virtio_net pcspkr i2c_piix4 floppy virtio_balloon microcode virtio_blk cirrus drm_kms_helper ttm drm i2c_core [last unloaded: scsi_wait_scan] CPU 0 Pid: 1286, comm: abrt-dump-oops Tainted: G D 3.5.0+ #1 Bochs Bochs RIP: 0010:[] [] audit_copy_inode+0x10/0x90 RSP: 0018:ffff88007aebfc38 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88003692d860 RCX: 00000000000038c4 RDX: 0000000000000000 RSI: ffff88006baf5d80 RDI: ffff88003692d860 RBP: ffff88007aebfc68 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffff880036d30f00 R14: ffff88006baf5d80 R15: ffff88003692d800 FS: 00007f7562634740(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000040 CR3: 000000003643d000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process abrt-dump-oops (pid: 1286, threadinfo ffff88007aebe000, task ffff880079614530) Stack: ffff88007aebfdf8 ffff88007aebff28 ffff88007aebfc98 ffffffff81211358 ffff88003692d860 0000000000000000 ffff88007aebfcc8 ffffffff810d4968 ffff88007aebfcc8 ffff8800000038c4 0000000000000000 0000000000000000 Call Trace: [] ? ext4_lookup+0xe8/0x160 [] __audit_inode+0x118/0x2d0 [] do_last+0x999/0xe80 [] ? inode_permission+0x18/0x50 [] ? kmem_cache_alloc_trace+0x11a/0x130 [] path_openat+0xba/0x420 [] do_filp_open+0x41/0xa0 [] ? alloc_fd+0x4d/0x120 [] do_sys_open+0xed/0x1c0 [] ? __audit_syscall_entry+0xcc/0x300 [] sys_open+0x21/0x30 [] system_call_fastpath+0x16/0x1b RSP CR2: 0000000000000040 The problem is that do_last is passing a negative dentry to audit_inode. The comments on lookup_open note that it can pass back a negative dentry if O_CREAT is not set. This patch fixes the oops, but I'm not clear on whether there's a better approach. Cc: Miklos Szeredi Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index a3fb78fd70d..afa087649dd 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2608,9 +2608,10 @@ retry_lookup: } /* - * It already exists. + * create/update audit record if it already exists. */ - audit_inode(pathname, path->dentry); + if (path->dentry->d_inode) + audit_inode(pathname, path->dentry); /* * If atomic_open() acquired write access it is dropped now due to -- cgit v1.2.3 From 800179c9b8a1e796e441674776d11cd4c05d61d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jul 2012 17:29:07 -0700 Subject: fs: add link restrictions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds symlink and hardlink restrictions to the Linux VFS. Symlinks: A long-standing class of security issues is the symlink-based time-of-check-time-of-use race, most commonly seen in world-writable directories like /tmp. The common method of exploitation of this flaw is to cross privilege boundaries when following a given symlink (i.e. a root process follows a symlink belonging to another user). For a likely incomplete list of hundreds of examples across the years, please see: http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp The solution is to permit symlinks to only be followed when outside a sticky world-writable directory, or when the uid of the symlink and follower match, or when the directory owner matches the symlink's owner. Some pointers to the history of earlier discussion that I could find: 1996 Aug, Zygo Blaxell http://marc.info/?l=bugtraq&m=87602167419830&w=2 1996 Oct, Andrew Tridgell http://lkml.indiana.edu/hypermail/linux/kernel/9610.2/0086.html 1997 Dec, Albert D Cahalan http://lkml.org/lkml/1997/12/16/4 2005 Feb, Lorenzo Hernández García-Hierro http://lkml.indiana.edu/hypermail/linux/kernel/0502.0/1896.html 2010 May, Kees Cook https://lkml.org/lkml/2010/5/30/144 Past objections and rebuttals could be summarized as: - Violates POSIX. - POSIX didn't consider this situation and it's not useful to follow a broken specification at the cost of security. - Might break unknown applications that use this feature. - Applications that break because of the change are easy to spot and fix. Applications that are vulnerable to symlink ToCToU by not having the change aren't. Additionally, no applications have yet been found that rely on this behavior. - Applications should just use mkstemp() or O_CREATE|O_EXCL. - True, but applications are not perfect, and new software is written all the time that makes these mistakes; blocking this flaw at the kernel is a single solution to the entire class of vulnerability. - This should live in the core VFS. - This should live in an LSM. (https://lkml.org/lkml/2010/5/31/135) - This should live in an LSM. - This should live in the core VFS. (https://lkml.org/lkml/2010/8/2/188) Hardlinks: On systems that have user-writable directories on the same partition as system files, a long-standing class of security issues is the hardlink-based time-of-check-time-of-use race, most commonly seen in world-writable directories like /tmp. The common method of exploitation of this flaw is to cross privilege boundaries when following a given hardlink (i.e. a root process follows a hardlink created by another user). Additionally, an issue exists where users can "pin" a potentially vulnerable setuid/setgid file so that an administrator will not actually upgrade a system fully. The solution is to permit hardlinks to only be created when the user is already the existing file's owner, or if they already have read/write access to the existing file. Many Linux users are surprised when they learn they can link to files they have no access to, so this change appears to follow the doctrine of "least surprise". Additionally, this change does not violate POSIX, which states "the implementation may require that the calling process has permission to access the existing file"[1]. This change is known to break some implementations of the "at" daemon, though the version used by Fedora and Ubuntu has been fixed[2] for a while. Otherwise, the change has been undisruptive while in use in Ubuntu for the last 1.5 years. [1] http://pubs.opengroup.org/onlinepubs/9699919799/functions/linkat.html [2] http://anonscm.debian.org/gitweb/?p=collab-maint/at.git;a=commitdiff;h=f4114656c3a6c6f6070e315ffdf940a49eda3279 This patch is based on the patches in Openwall and grsecurity, along with suggestions from Al Viro. I have added a sysctl to enable the protected behavior, and documentation. Signed-off-by: Kees Cook Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namei.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index afa087649dd..3861d85f848 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -650,6 +650,119 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki path_put(link); } +int sysctl_protected_symlinks __read_mostly = 1; +int sysctl_protected_hardlinks __read_mostly = 1; + +/** + * may_follow_link - Check symlink following for unsafe situations + * @link: The path of the symlink + * + * In the case of the sysctl_protected_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to be followed only when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static inline int may_follow_link(struct path *link, struct nameidata *nd) +{ + const struct inode *inode; + const struct inode *parent; + + if (!sysctl_protected_symlinks) + return 0; + + /* Allowed if owner and follower match. */ + inode = link->dentry->d_inode; + if (current_cred()->fsuid == inode->i_uid) + return 0; + + /* Allowed if parent directory not sticky and world-writable. */ + parent = nd->path.dentry->d_inode; + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + return 0; + + /* Allowed if parent directory and link owner match. */ + if (parent->i_uid == inode->i_uid) + return 0; + + path_put_conditional(link, nd); + path_put(&nd->path); + return -EACCES; +} + +/** + * safe_hardlink_source - Check for safe hardlink conditions + * @inode: the source inode to hardlink from + * + * Return false if at least one of the following conditions: + * - inode is not a regular file + * - inode is setuid + * - inode is setgid and group-exec + * - access failure for read and write + * + * Otherwise returns true. + */ +static bool safe_hardlink_source(struct inode *inode) +{ + umode_t mode = inode->i_mode; + + /* Special files should not get pinned to the filesystem. */ + if (!S_ISREG(mode)) + return false; + + /* Setuid files should not get pinned to the filesystem. */ + if (mode & S_ISUID) + return false; + + /* Executable setgid files should not get pinned to the filesystem. */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + return false; + + /* Hardlinking to unreadable or unwritable sources is dangerous. */ + if (inode_permission(inode, MAY_READ | MAY_WRITE)) + return false; + + return true; +} + +/** + * may_linkat - Check permissions for creating a hardlink + * @link: the source to hardlink from + * + * Block hardlink when all of: + * - sysctl_protected_hardlinks enabled + * - fsuid does not match inode + * - hardlink source is unsafe (see safe_hardlink_source() above) + * - not CAP_FOWNER + * + * Returns 0 if successful, -ve on error. + */ +static int may_linkat(struct path *link) +{ + const struct cred *cred; + struct inode *inode; + + if (!sysctl_protected_hardlinks) + return 0; + + cred = current_cred(); + inode = link->dentry->d_inode; + + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, + * otherwise, it must be a safe source. + */ + if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || + capable(CAP_FOWNER)) + return 0; + + return -EPERM; +} + static __always_inline int follow_link(struct path *link, struct nameidata *nd, void **p) { @@ -1818,6 +1931,9 @@ static int path_lookupat(int dfd, const char *name, while (err > 0) { void *cookie; struct path link = path; + err = may_follow_link(&link, nd); + if (unlikely(err)) + break; nd->flags |= LOOKUP_PARENT; err = follow_link(&link, nd, &cookie); if (err) @@ -2778,6 +2894,9 @@ static struct file *path_openat(int dfd, const char *pathname, error = -ELOOP; break; } + error = may_follow_link(&link, nd); + if (unlikely(error)) + break; nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = follow_link(&link, nd, &cookie); @@ -3421,6 +3540,9 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; + error = may_linkat(&old_path); + if (unlikely(error)) + goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; -- cgit v1.2.3 From a51d9eaa41866ab6b4b6ecad7b621f8b66ece0dc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Jul 2012 17:29:08 -0700 Subject: fs: add link restriction audit reporting Adds audit messages for unexpected link restriction violations so that system owners will have some sort of potentially actionable information about misbehaving processes. Signed-off-by: Kees Cook Signed-off-by: Al Viro --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3861d85f848..618d3531cf9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -692,6 +692,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) path_put_conditional(link, nd); path_put(&nd->path); + audit_log_link_denied("follow_link", link); return -EACCES; } @@ -760,6 +761,7 @@ static int may_linkat(struct path *link) capable(CAP_FOWNER)) return 0; + audit_log_link_denied("linkat", link); return -EPERM; } -- cgit v1.2.3 From 446945ab9a82515af4b099107eda27050e077c58 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 Jul 2012 00:39:50 +0400 Subject: lockd: shift grabbing a reference to nlm_host into nlm_alloc_call() It's used both for client and server hosts; we can't do nlmclnt_release_host() on failure exits, since the host might need nlmsvc_release_host(), with BUG_ON() for calling the wrong one. Makes life simpler for callers, actually... Signed-off-by: Al Viro --- fs/lockd/clntproc.c | 9 ++------- fs/lockd/svc4proc.c | 1 + fs/lockd/svclock.c | 1 - fs/lockd/svcproc.c | 1 + 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 8392cb85bd5..27c74f32671 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -156,7 +156,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) struct nlm_rqst *call; int status; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; @@ -185,9 +184,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc); /* * Allocate an NLM RPC call struct - * - * Note: the caller must hold a reference to host. In case of failure, - * this reference will be released. */ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) { @@ -199,7 +195,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) atomic_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); - call->a_host = host; + call->a_host = nlm_get_host(host); return call; } if (signalled()) @@ -207,7 +203,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlmclnt_release_host(host); return NULL; } @@ -750,7 +745,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" " Attempting to cancel lock.\n"); - req = nlm_alloc_call(nlm_get_host(host)); + req = nlm_alloc_call(host); if (!req) return -ENOMEM; req->a_flags = RPC_TASK_ASYNC; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 9a41fdc1951..185fda89478 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -256,6 +256,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e46353f41a4..b54acaf6598 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index d27aab11f32..90cfe9a0bf5 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -294,6 +294,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; -- cgit v1.2.3 From bf8848918d751c1fb86f6514a75bf8d406f1c3c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Jul 2012 23:17:39 +0400 Subject: lockd: handle lockowner allocation failure in nlmclnt_proc() Signed-off-by: Al Viro --- fs/lockd/clntproc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 27c74f32671..05d29124c6a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -161,6 +161,11 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) return -ENOMEM; nlmclnt_locks_init_private(fl, host); + if (!fl->fl_u.nfs_fl.owner) { + /* lockowner allocation has failed */ + nlmclnt_release_call(call); + return -ENOMEM; + } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); -- cgit v1.2.3 From f8310c59201b183ebee2e3fe0c7242f5729be0af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Jul 2012 11:50:30 +0400 Subject: fix O_EXCL handling for devices O_EXCL without O_CREAT has different semantics; it's "fail if already opened", not "fail if already exists". commit 71574865 broke that... Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 618d3531cf9..e133bf3bbb0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2418,7 +2418,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if (open_flag & O_EXCL) { + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { open_flag &= ~O_TRUNC; *opened |= FILE_CREATED; } @@ -2742,7 +2742,7 @@ retry_lookup: } error = -EEXIST; - if (open_flag & O_EXCL) + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) goto exit_dput; error = follow_managed(path, nd->flags); -- cgit v1.2.3 From 64894cf843278c7b2653a6fac2cd1a697ff930dc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 31 Jul 2012 00:53:35 +0400 Subject: simplify lookup_open()/atomic_open() - do the temporary mnt_want_write() early The write ref to vfsmount taken in lookup_open()/atomic_open() is going to be dropped; we take the one to stay in dentry_open(). Just grab the temporary in caller if it looks like we are going to need it (create/truncate/writable open) and pass (by value) "has it succeeded" flag. Instead of doing mnt_want_write() inside, check that flag and treat "false" as "mnt_want_write() has just failed". mnt_want_write() is cheap and the things get considerably simpler and more robust that way - we get it and drop it in the same function, to start with, rather than passing a "has something in the guts of really scary functions taken it" back to caller. Signed-off-by: Al Viro --- fs/namei.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e133bf3bbb0..35291ac6f42 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2395,7 +2395,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, bool need_lookup, + bool got_write, bool need_lookup, int *opened) { struct inode *dir = nd->path.dentry->d_inode; @@ -2432,12 +2432,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ - if ((open_flag & (O_CREAT | O_TRUNC)) || - (open_flag & O_ACCMODE) != O_RDONLY) { - error = mnt_want_write(nd->path.mnt); - if (!error) { - *want_write = true; - } else if (!(open_flag & O_CREAT)) { + if (((open_flag & (O_CREAT | O_TRUNC)) || + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { + if (!(open_flag & O_CREAT)) { /* * No O_CREATE -> atomicity not a requirement -> fall * back to lookup + open @@ -2445,11 +2442,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto no_open; } else if (open_flag & (O_EXCL | O_TRUNC)) { /* Fall back and fail with the right error */ - create_error = error; + create_error = -EROFS; goto no_open; } else { /* No side effects, safe to clear O_CREAT */ - create_error = error; + create_error = -EROFS; open_flag &= ~O_CREAT; } } @@ -2556,7 +2553,7 @@ looked_up: static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, int *opened) + bool got_write, int *opened) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -2574,7 +2571,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, goto out_no_open; if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { - return atomic_open(nd, dentry, path, file, op, want_write, + return atomic_open(nd, dentry, path, file, op, got_write, need_lookup, opened); } @@ -2598,10 +2595,10 @@ static int lookup_open(struct nameidata *nd, struct path *path, * a permanent write count is taken through * the 'struct file' in finish_open(). */ - error = mnt_want_write(nd->path.mnt); - if (error) + if (!got_write) { + error = -EROFS; goto out_dput; - *want_write = true; + } *opened |= FILE_CREATED; error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) @@ -2631,7 +2628,7 @@ static int do_last(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; - bool want_write = false; + bool got_write = false; int acc_mode = op->acc_mode; struct inode *inode; bool symlink_ok = false; @@ -2700,8 +2697,18 @@ static int do_last(struct nameidata *nd, struct path *path, } retry_lookup: + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + error = mnt_want_write(nd->path.mnt); + if (!error) + got_write = true; + /* + * do _not_ fail yet - we might not need that or fail with + * a different error; let lookup_open() decide; we'll be + * dropping this one anyway. + */ + } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, &want_write, opened); + error = lookup_open(nd, path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -2736,9 +2743,9 @@ retry_lookup: * possible mount and symlink following (this might be optimized away if * necessary...) */ - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } error = -EEXIST; @@ -2803,7 +2810,7 @@ finish_open: error = mnt_want_write(nd->path.mnt); if (error) goto out; - want_write = true; + got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); @@ -2830,7 +2837,7 @@ opened: goto exit_fput; } out: - if (want_write) + if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); terminate_walk(nd); @@ -2854,9 +2861,9 @@ stale_open: nd->inode = dir->d_inode; save_parent.mnt = NULL; save_parent.dentry = NULL; - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } retried = true; goto retry_lookup; -- cgit v1.2.3 From 5e8830dc85d0a6258132977381430b327cf553f2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:23 +0200 Subject: fs: Push file_update_time() into __block_page_mkwrite() Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/buffer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c7062c896d7..d5ec360e332 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2318,6 +2318,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t size; int ret; + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || -- cgit v1.2.3 From 3ca9c3bd8a55956bee291cda5b224f737b0d0cfe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:24 +0200 Subject: ceph: Push file_update_time() into ceph_page_mkwrite() CC: Sage Weil CC: ceph-devel@vger.kernel.org Acked-by: Sage Weil Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ceph/addr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b67304e4b8..452e71a1b75 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size, len; int ret; + /* Update time before taking page lock */ + file_update_time(vma->vm_file); + size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; -- cgit v1.2.3 From 120c2bcad80c0f7c6691ad70f7fb1e709854d725 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:25 +0200 Subject: 9p: Push file_update_time() into v9fs_vm_page_mkwrite() CC: Eric Van Hensbergen CC: Ron Minnich CC: Latchesar Ionkov CC: v9fs-developer@lists.sourceforge.net Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/9p/vfs_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fc06fd27065..dd6f7ee1e31 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); + /* Update file times before taking page lock */ + file_update_time(filp); + v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ v9fs_fscache_wait_on_page_write(inode, page); -- cgit v1.2.3 From a63e9b2e76632b3ccb0f33f43484c0c64d601849 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:26 +0200 Subject: gfs2: Push file_update_time() into gfs2_page_mkwrite() CC: Steven Whitehouse CC: cluster-devel@redhat.com Acked-by: Steven Whitehouse Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/gfs2/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 31b199f6efc..07959150d44 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -376,6 +376,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) -- cgit v1.2.3 From 14ae417c6faf28b6e8ec60cc2aa0eaa19453a41c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:27 +0200 Subject: sysfs: Push file_update_time() into bin_page_mkwrite() CC: Greg Kroah-Hartman Acked-by: Greg Kroah-Hartman Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/sysfs/bin.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index a4759833d62..614b2b54488 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = 0; if (bb->vm_ops->page_mkwrite) ret = bb->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); sysfs_put_active(attr_sd); return ret; -- cgit v1.2.3 From c30dabfe5d10c5fd70d882e5afb8f59f2942b194 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:30 +0200 Subject: fs: Push mnt_want_write() outside of i_mutex Currently, mnt_want_write() is sometimes called with i_mutex held and sometimes without it. This isn't really a problem because mnt_want_write() is a non-blocking operation (essentially has a trylock semantics) but when the function starts to handle also frozen filesystems, it will get a full lock semantics and thus proper lock ordering has to be established. So move all mnt_want_write() calls outside of i_mutex. One non-trivial case needing conversion is kern_path_create() / user_path_create() which didn't include mnt_want_write() but now needs to because it acquires i_mutex. Because there are virtual file systems which don't bother with freeze / remount-ro protection we actually provide both versions of the function - one which calls mnt_want_write() and one which does not. [AV: scratch the previous, mnt_want_write() has been moved to kern_path_create() by now] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/namei.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 35291ac6f42..1b464390dde 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2975,6 +2975,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path { struct dentry *dentry = ERR_PTR(-EEXIST); struct nameidata nd; + int err2; int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); @@ -2988,6 +2989,8 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; + /* don't fail immediately if it's r/o, at least try to report other errors */ + err2 = mnt_want_write(nd.path.mnt); /* * Do the final lookup. */ @@ -3009,9 +3012,10 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path error = -ENOENT; goto fail; } - error = mnt_want_write(nd.path.mnt); - if (error) + if (unlikely(err2)) { + error = err2; goto fail; + } *path = nd.path; return dentry; fail: @@ -3019,6 +3023,8 @@ fail: dentry = ERR_PTR(error); unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (!err2) + mnt_drop_write(nd.path.mnt); out: path_put(&nd.path); return dentry; @@ -3266,6 +3272,9 @@ static long do_rmdir(int dfd, const char __user *pathname) } nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3276,19 +3285,15 @@ static long do_rmdir(int dfd, const char __user *pathname) error = -ENOENT; goto exit3; } - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit3; error = security_path_rmdir(&nd.path, dentry); if (error) - goto exit4; + goto exit3; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -exit4: - mnt_drop_write(nd.path.mnt); exit3: dput(dentry); exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3355,6 +3360,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) goto exit1; nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3367,21 +3375,17 @@ static long do_unlinkat(int dfd, const char __user *pathname) if (!inode) goto slashes; ihold(inode); - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit2; error = security_path_unlink(&nd.path, dentry); if (error) - goto exit3; + goto exit2; error = vfs_unlink(nd.path.dentry->d_inode, dentry); -exit3: - mnt_drop_write(nd.path.mnt); - exit2: +exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3753,6 +3757,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (newnd.last_type != LAST_NORM) goto exit2; + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit2; + oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; newnd.flags |= LOOKUP_RENAME_TARGET; @@ -3788,23 +3796,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (new_dentry == trap) goto exit5; - error = mnt_want_write(oldnd.path.mnt); - if (error) - goto exit5; error = security_path_rename(&oldnd.path, old_dentry, &newnd.path, new_dentry); if (error) - goto exit6; + goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); -exit6: - mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + mnt_drop_write(oldnd.path.mnt); exit2: path_put(&newnd.path); putname(to); -- cgit v1.2.3 From e24f17da3560781e274699f066fb788ad52f4402 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:31 +0200 Subject: fat: Push mnt_want_write() outside of i_mutex When mnt_want_write() starts to handle freezing it will get a full lock semantics requiring proper lock ordering. So push mnt_want_write() call outside of i_mutex as in other places. CC: OGAWA Hirofumi Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/fat/file.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index a71fe3715ee..e007b8bd8e5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (err) goto out; - mutex_lock(&inode->i_mutex); err = mnt_want_write_file(file); if (err) - goto out_unlock_inode; + goto out; + mutex_lock(&inode->i_mutex); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) /* The root directory has no attributes */ if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { err = -EINVAL; - goto out_drop_write; + goto out_unlock_inode; } if (sbi->options.sys_immutable && ((attr | oldattr) & ATTR_SYS) && !capable(CAP_LINUX_IMMUTABLE)) { err = -EPERM; - goto out_drop_write; + goto out_unlock_inode; } /* @@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) */ err = security_inode_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ err = fat_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; fsnotify_change(file->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { @@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); -out_drop_write: - mnt_drop_write_file(file); out_unlock_inode: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); out: return err; } -- cgit v1.2.3 From e7848683ae7ded0a4a8964122a47da9104a98337 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:32 +0200 Subject: btrfs: Push mnt_want_write() outside of i_mutex When mnt_want_write() starts to handle freezing it will get a full lock semantics requiring proper lock ordering. So push mnt_want_write() call consistently outside of i_mutex. CC: Chris Mason CC: linux-btrfs@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1e9f6c019ad..cd93eb530b7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -193,6 +193,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(file); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); ip_oldflags = ip->flags; @@ -207,10 +211,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write_file(file); - if (ret) - goto out_unlock; - if (flags & FS_SYNC_FL) ip->flags |= BTRFS_INODE_SYNC; else @@ -273,9 +273,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode->i_flags = i_oldflags; } - mnt_drop_write_file(file); out_unlock: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); return ret; } @@ -641,6 +641,10 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; + error = mnt_want_write(parent->mnt); + if (error) + return error; + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); @@ -652,13 +656,9 @@ static noinline int btrfs_mksubvol(struct path *parent, if (dentry->d_inode) goto out_dput; - error = mnt_want_write(parent->mnt); - if (error) - goto out_dput; - error = btrfs_may_create(dir, dentry); if (error) - goto out_drop_write; + goto out_dput; down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); @@ -676,12 +676,11 @@ static noinline int btrfs_mksubvol(struct path *parent, fsnotify_mkdir(dir, dentry); out_up_read: up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); -out_drop_write: - mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: mutex_unlock(&dir->i_mutex); + mnt_drop_write(parent->mnt); return error; } -- cgit v1.2.3 From 4a55c1017b8dcfd0554734ce3f19374d5b522d59 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:33 +0200 Subject: nfsd: Push mnt_want_write() outside of i_mutex When mnt_want_write() starts to handle freezing it will get a full lock semantics requiring proper lock ordering. So push mnt_want_write() call consistently outside of i_mutex. CC: linux-nfs@vger.kernel.org CC: "J. Bruce Fields" Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/nfsd/nfs4recover.c | 9 +++--- fs/nfsd/nfsfh.c | 1 + fs/nfsd/nfsproc.c | 9 +++++- fs/nfsd/vfs.c | 79 ++++++++++++++++++++++++++------------------------- fs/nfsd/vfs.h | 11 +++++-- 5 files changed, 63 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5ff0b7b9fc0..43295d45cc2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (status < 0) return; + status = mnt_want_write_file(rec_file); + if (status) + return; + dir = rec_file->f_path.dentry; /* lock the parent */ mutex_lock(&dir->d_inode->i_mutex); @@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = mnt_want_write_file(rec_file); - if (status) - goto out_put; status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); - mnt_drop_write_file(rec_file); out_put: dput(dentry); out_unlock: @@ -189,6 +189,7 @@ out_unlock: " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); + mnt_drop_write_file(rec_file); nfs4_reset_creds(original_cred); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cc793005a87..032af381b3a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp) fhp->fh_post_saved = 0; #endif } + fh_drop_write(fhp); if (exp) { exp_put(exp); fhp->fh_export = NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e15dc45fc5e..aad6d457b9e 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, struct dentry *dchild; int type, mode; __be32 nfserr; + int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); dprintk("nfsd: CREATE %s %.*s\n", @@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; + hosterr = fh_want_write(dirfhp); + if (hosterr) { + nfserr = nfserrno(hosterr); + goto done; + } + fh_lock_nested(dirfhp, I_MUTEX_PARENT); dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); if (IS_ERR(dchild)) { @@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, out_unlock: /* We don't really need to unlock, as fh_put does it. */ fh_unlock(dirfhp); - + fh_drop_write(dirfhp); done: fh_put(dirfhp); return nfsd_return_dirop(nfserr, resp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4700a0a929d..dccd396a1bb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1276,6 +1276,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * If it has, the parent directory should already be locked. */ if (!resfhp->fh_dentry) { + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ fh_lock_nested(fhp, I_MUTEX_PARENT); dchild = lookup_one_len(fname, dentry, flen); @@ -1319,14 +1323,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - /* * Get the dir op function pointer. */ err = 0; + host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, true); @@ -1343,10 +1344,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; } - if (host_err < 0) { - fh_drop_write(fhp); + if (host_err < 0) goto out_nfserr; - } err = nfsd_create_setattr(rqstp, resfhp, iap); @@ -1358,7 +1357,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; - fh_drop_write(fhp); /* * Update the file handle to get the new inode info. */ @@ -1417,6 +1415,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notdir; if (!dirp->i_op->lookup) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); /* @@ -1449,9 +1452,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; if (dchild->d_inode) { err = 0; @@ -1522,7 +1522,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = nfserrno(commit_metadata(fhp)); - fh_drop_write(fhp); /* * Update the filehandle to get the new inode info. */ @@ -1533,6 +1532,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, fh_unlock(fhp); if (dchild && !IS_ERR(dchild)) dput(dchild); + fh_drop_write(fhp); return err; out_nfserr: @@ -1613,6 +1613,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock(fhp); dentry = fhp->fh_dentry; dnew = lookup_one_len(fname, dentry, flen); @@ -1620,10 +1625,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - if (unlikely(path[plen] != 0)) { char *path_alloced = kmalloc(plen+1, GFP_KERNEL); if (path_alloced == NULL) @@ -1683,6 +1684,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + host_err = fh_want_write(tfhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = ddir->d_inode; @@ -1694,18 +1701,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; - host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } err = nfserr_noent; if (!dold->d_inode) - goto out_drop_write; + goto out_dput; host_err = nfsd_break_lease(dold->d_inode); if (host_err) { err = nfserrno(host_err); - goto out_drop_write; + goto out_dput; } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { @@ -1718,12 +1720,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } -out_drop_write: - fh_drop_write(tfhp); out_dput: dput(dnew); out_unlock: fh_unlock(ffhp); + fh_drop_write(tfhp); out: return err; @@ -1766,6 +1767,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + host_err = fh_want_write(ffhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + /* cannot use fh_lock as we need deadlock protective ordering * so do it by hand */ trap = lock_rename(tdentry, fdentry); @@ -1796,17 +1803,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; - host_err = fh_want_write(ffhp); - if (host_err) - goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; if (ndentry->d_inode) { host_err = nfsd_break_lease(ndentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; } host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { @@ -1814,8 +1818,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!host_err) host_err = commit_metadata(ffhp); } -out_drop_write: - fh_drop_write(ffhp); out_dput_new: dput(ndentry); out_dput_old: @@ -1831,6 +1833,7 @@ out_drop_write: fill_post_wcc(tfhp); unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = 0; + fh_drop_write(ffhp); out: return err; @@ -1856,6 +1859,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (err) goto out; + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = dentry->d_inode; @@ -1874,21 +1881,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = fh_want_write(fhp); - if (host_err) - goto out_put; - host_err = nfsd_break_lease(rdentry->d_inode); if (host_err) - goto out_drop_write; + goto out_put; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_drop_write: - fh_drop_write(fhp); out_put: dput(rdentry); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ec0611b2b73..359594c393d 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); static inline int fh_want_write(struct svc_fh *fh) { - return mnt_want_write(fh->fh_export->ex_path.mnt); + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); + + if (!ret) + fh->fh_want_write = 1; + return ret; } static inline void fh_drop_write(struct svc_fh *fh) { - mnt_drop_write(fh->fh_export->ex_path.mnt); + if (fh->fh_want_write) { + fh->fh_want_write = 0; + mnt_drop_write(fh->fh_export->ex_path.mnt); + } } #endif /* LINUX_NFSD_VFS_H */ -- cgit v1.2.3 From 5accdf82ba25cacefd6c1867f1704beb4d244cdd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:34 +0200 Subject: fs: Improve filesystem freezing handling vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the file system is already frozen. This creates problems with flusher thread hanging on frozen filesystem. Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again or we see s_frozen set and bail out of page fault. This works to protect from page being marked writeable while filesystem freezing is running but has an unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on frozen filesystem resulting in similar problems with flusher thread as the first problem. This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores because of lock ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one of internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold reader side of the semaphore. Code freezing the filesystem takes the writer side. Only that we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter and the writer side is implemented using s_writers.frozen superblock field. [AV: microoptimize sb_start_write(); we want it fast in normal case] BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 230 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index c743fb3be4b..0f64ecb7b1b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,12 +33,19 @@ #include #include #include +#include #include "internal.h" LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +static char *sb_writers_name[SB_FREEZE_LEVELS] = { + "sb_writers", + "sb_pagefaults", + "sb_internal", +}; + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return total_objects; } +static int init_sb_writers(struct super_block *s, struct file_system_type *type) +{ + int err; + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + err = percpu_counter_init(&s->s_writers.counter[i], 0); + if (err < 0) + goto err_out; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); + } + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + return 0; +err_out: + while (--i >= 0) + percpu_counter_destroy(&s->s_writers.counter[i]); + return err; +} + +static void destroy_sb_writers(struct super_block *s) +{ + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (s) { if (security_sb_alloc(s)) { + /* + * We cannot call security_sb_free() without + * security_sb_alloc() succeeding. So bail out manually + */ kfree(s); s = NULL; goto out; } #ifdef CONFIG_SMP s->s_files = alloc_percpu(struct list_head); - if (!s->s_files) { - security_sb_free(s); - kfree(s); - s = NULL; - goto out; - } else { + if (!s->s_files) + goto err_out; + else { int i; for_each_possible_cpu(i) @@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) #else INIT_LIST_HEAD(&s->s_files); #endif + if (init_sb_writers(s, type)) + goto err_out; s->s_flags = flags; s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); @@ -190,6 +229,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } out: return s; +err_out: + security_sb_free(s); +#ifdef CONFIG_SMP + if (s->s_files) + free_percpu(s->s_files); +#endif + destroy_sb_writers(s); + kfree(s); + s = NULL; + goto out; } /** @@ -203,6 +252,7 @@ static inline void destroy_super(struct super_block *s) #ifdef CONFIG_SMP free_percpu(s->s_files); #endif + destroy_sb_writers(s); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); @@ -651,10 +701,11 @@ struct super_block *get_super_thawed(struct block_device *bdev) { while (1) { struct super_block *s = get_super(bdev); - if (!s || s->s_frozen == SB_UNFROZEN) + if (!s || s->s_writers.frozen == SB_UNFROZEN) return s; up_read(&s->s_umount); - vfs_check_frozen(s, SB_FREEZE_WRITE); + wait_event(s->s_writers.wait_unfrozen, + s->s_writers.frozen == SB_UNFROZEN); put_super(s); } } @@ -732,7 +783,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) int retval; int remount_ro; - if (sb->s_frozen != SB_UNFROZEN) + if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; #ifdef CONFIG_BLOCK @@ -1163,6 +1214,120 @@ out: return ERR_PTR(error); } +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ + percpu_counter_dec(&sb->s_writers.counter[level-1]); + /* + * Make sure s_writers are updated before we wake up waiters in + * freeze_super(). + */ + smp_mb(); + if (waitqueue_active(&sb->s_writers.wait)) + wake_up(&sb->s_writers.wait); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's it bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, + unsigned long ip) +{ + int i; + + if (!trylock) { + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + trylock = true; + break; + } + } + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: + if (unlikely(sb->s_writers.frozen >= level)) { + if (!wait) + return 0; + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen < level); + } + +#ifdef CONFIG_LOCKDEP + acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif + percpu_counter_inc(&sb->s_writers.counter[level-1]); + /* + * Make sure counter is updated before we check for frozen. + * freeze_super() first sets frozen and then checks the counter. + */ + smp_mb(); + if (unlikely(sb->s_writers.frozen >= level)) { + __sb_end_write(sb, level); + goto retry; + } + return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. + */ +static void sb_wait_write(struct super_block *sb, int level) +{ + s64 writers; + + /* + * We just cycle-through lockdep here so that it does not complain + * about returning with lock to userspace + */ + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + + do { + DEFINE_WAIT(wait); + + /* + * We use a barrier in prepare_to_wait() to separate setting + * of frozen and checking of the counter + */ + prepare_to_wait(&sb->s_writers.wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); + if (writers) + schedule(); + + finish_wait(&sb->s_writers.wait, &wait); + } while (writers); +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1170,6 +1335,31 @@ out: * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb) { @@ -1177,7 +1367,7 @@ int freeze_super(struct super_block *sb) atomic_inc(&sb->s_active); down_write(&sb->s_umount); - if (sb->s_frozen) { + if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; } @@ -1188,33 +1378,53 @@ int freeze_super(struct super_block *sb) } if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); + /* Nothing to do really... */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } - sb->s_frozen = SB_FREEZE_WRITE; + /* From now on, no new normal writers can start */ + sb->s_writers.frozen = SB_FREEZE_WRITE; + smp_wmb(); + + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ + up_write(&sb->s_umount); + + sb_wait_write(sb, SB_FREEZE_WRITE); + + /* Now we go and block page faults... */ + down_write(&sb->s_umount); + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + + /* All writers are done so after syncing there won't be dirty data */ sync_filesystem(sb); - sb->s_frozen = SB_FREEZE_TRANS; + /* Now wait for internal filesystem counter */ + sb->s_writers.frozen = SB_FREEZE_FS; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_FS); - sync_blockdev(sb->s_bdev); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } } + /* + * This is just for debugging purposes so that fs can warn if it + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } @@ -1231,7 +1441,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen == SB_UNFROZEN) { up_write(&sb->s_umount); return -EINVAL; } @@ -1244,16 +1454,15 @@ int thaw_super(struct super_block *sb) if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; up_write(&sb->s_umount); return error; } } out: - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; -- cgit v1.2.3 From eb04c28288bb0098d0e75d81ba2a575239de71d8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:35 +0200 Subject: fs: Add freezing handling to mnt_want_write() / mnt_drop_write() Most of places where we want freeze protection coincides with the places where we also have remount-ro protection. So make mnt_want_write() and mnt_drop_write() (and their _file alternative) prevent freezing as well. For the few cases that are really interested only in remount-ro protection provide new function variants. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/file_table.c | 2 +- fs/inode.c | 4 +-- fs/internal.h | 4 +++ fs/namespace.c | 97 +++++++++++++++++++++++++++++++++++++++++++++------------ fs/open.c | 2 +- 5 files changed, 85 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index b54bf7fd0b1..701985e4ccd 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file) return; if (file_check_writeable(file) != 0) return; - mnt_drop_write(mnt); + __mnt_drop_write(mnt); file_release_write(file); } diff --git a/fs/inode.c b/fs/inode.c index 775cbabd4fa..006c85ca06e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1660,11 +1660,11 @@ int file_update_time(struct file *file) return 0; /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) + if (__mnt_want_write_file(file)) return 0; ret = update_time(inode, &now, sync_it); - mnt_drop_write_file(file); + __mnt_drop_write_file(file); return ret; } diff --git a/fs/internal.h b/fs/internal.h index a6fd56c68b1..371bcc4b169 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,6 +61,10 @@ extern void __init mnt_init(void); extern struct lglock vfsmount_lock; +extern int __mnt_want_write(struct vfsmount *); +extern int __mnt_want_write_file(struct file *); +extern void __mnt_drop_write(struct vfsmount *); +extern void __mnt_drop_write_file(struct file *); /* * fs_struct.c diff --git a/fs/namespace.c b/fs/namespace.c index c53d3381b0d..4d31f73e256 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt) } /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. */ /** - * mnt_want_write - get write access to a mount + * __mnt_want_write - get write access to a mount without freeze protection * @m: the mount on which to take a write * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success. When - * the write operation is finished, mnt_drop_write() - * must be called. This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *m) +int __mnt_want_write(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m) ret = -EROFS; } preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); @@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt) EXPORT_SYMBOL_GPL(mnt_clone_write); /** - * mnt_want_write_file - get write access to a file's mount + * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can + * This is like __mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already */ -int mnt_want_write_file(struct file *file) +int __mnt_want_write_file(struct file *file) { struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) - return mnt_want_write(file->f_path.mnt); + return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); } + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above. */ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} EXPORT_SYMBOL_GPL(mnt_drop_write); +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + void mnt_drop_write_file(struct file *file) { mnt_drop_write(file->f_path.mnt); diff --git a/fs/open.c b/fs/open.c index 8d2c8970029..9ddc1856550 100644 --- a/fs/open.c +++ b/fs/open.c @@ -620,7 +620,7 @@ static inline int __get_file_write_access(struct inode *inode, /* * Balanced in __fput() */ - error = mnt_want_write(mnt); + error = __mnt_want_write(mnt); if (error) put_write_access(inode); } -- cgit v1.2.3 From 5d37e9e6dec65cd21be68ee92de99686213e916b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:36 +0200 Subject: fs: Skip atime update on frozen filesystem It is unexpected to block reading of frozen filesystem because of atime update. Also handling blocking on frozen filesystem because of atime update would make locking more complex than it already is. So just skip atime update when filesystem is frozen like we skip it when filesystem is remounted read-only. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 006c85ca06e..74d7c20fac8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1542,9 +1542,11 @@ void touch_atime(struct path *path) if (timespec_equal(&inode->i_atime, &now)) return; - if (mnt_want_write(mnt)) + if (!sb_start_write_trylock(inode->i_sb)) return; + if (__mnt_want_write(mnt)) + goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for @@ -1553,7 +1555,9 @@ void touch_atime(struct path *path) * so just ignore the return value. */ update_time(inode, &now, S_ATIME); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); -- cgit v1.2.3 From 14da9200140f8d722ad1767dfabadebd8b34f2ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:37 +0200 Subject: fs: Protect write paths by sb_start_write - sb_end_write There are several entry points which dirty pages in a filesystem. mmap (handled by block_page_mkwrite()), buffered write (handled by __generic_file_aio_write()), splice write (generic_file_splice_write), truncate, and fallocate (these can dirty last partial page - handled inside each filesystem separately). Protect these places with sb_start_write() and sb_end_write(). ->page_mkwrite() calls are particularly complex since they are called with mmap_sem held and thus we cannot use standard sb_start_write() due to lock ordering constraints. We solve the problem by using a special freeze protection sb_start_pagefault() which ranks below mmap_sem. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/buffer.c | 22 ++++------------------ fs/open.c | 7 ++++++- fs/splice.c | 3 +++ 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index d5ec360e332..9f6d2e41281 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2306,8 +2306,8 @@ EXPORT_SYMBOL(block_commit_write); * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. + * Direct callers of this function should protect against filesystem freezing + * using sb_start_write() - sb_end_write() functions. */ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) @@ -2345,18 +2345,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; - /* - * Freezing in progress? We check after the page is marked dirty and - * with page lock held so if the test here fails, we are sure freezing - * code will wait during syncing until the page fault is done - at that - * point page will be dirty and unlocked so freezing code will write it - * and writeprotect it again. - */ set_page_dirty(page); - if (inode->i_sb->s_frozen != SB_UNFROZEN) { - ret = -EAGAIN; - goto out_unlock; - } wait_on_page_writeback(page); return 0; out_unlock: @@ -2371,12 +2360,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int ret; struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; - /* - * This check is racy but catches the common case. The check in - * __block_page_mkwrite() is reliable. - */ - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_pagefault(sb); ret = __block_page_mkwrite(vma, vmf, get_block); + sb_end_pagefault(sb); return block_page_mkwrite_return(ret); } EXPORT_SYMBOL(block_page_mkwrite); diff --git a/fs/open.c b/fs/open.c index 9ddc1856550..f3d96e7e7b1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(inode)) goto out_putf; + sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, file, length); if (!error) error = security_path_truncate(&file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); + sb_end_write(inode->i_sb); out_putf: fput(file); out: @@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!file->f_op->fallocate) return -EOPNOTSUPP; - return file->f_op->fallocate(file, mode, offset, len); + sb_start_write(inode->i_sb); + ret = file->f_op->fallocate(file, mode, offset, len); + sb_end_write(inode->i_sb); + return ret; } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) diff --git a/fs/splice.c b/fs/splice.c index 7bf08fa22ec..41514dd8946 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; + sb_start_write(inode->i_sb); + pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } + sb_end_write(inode->i_sb); return ret; } -- cgit v1.2.3 From 8e8ad8a57c75f3bda2d03a4c4396a9a7024ad275 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:38 +0200 Subject: ext4: Convert to new freezing mechanism We remove most of frozen checks since upper layer takes care of blocking all writes. We have to handle protection in ext4_page_mkwrite() in a special way because we cannot use generic block_page_mkwrite(). Also we add a freeze protection to ext4_evict_inode() so that iput() of unlinked inode cannot modify a frozen filesystem (we cannot easily instrument ext4_journal_start() / ext4_journal_stop() with freeze protection because we are missing the superblock pointer in ext4_journal_stop() in nojournal mode). CC: linux-ext4@vger.kernel.org CC: "Theodore Ts'o" BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Acked-by: "Theodore Ts'o" Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ext4/inode.c | 15 ++++++++++----- fs/ext4/mmp.c | 6 ++++++ fs/ext4/super.c | 31 +++++++------------------------ 3 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02bc8cbe728..301e1c2db89 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); @@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode) * cleaned up. */ ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } @@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode) stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } } @@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode) else ext4_free_inode(handle, inode); ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -4701,11 +4709,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) get_block_t *get_block; int retries = 0; - /* - * This check is racy but catches the common case. We rely on - * __block_page_mkwrite() to do a reliable check. - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -4773,5 +4777,6 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f99a1311e84..fe7c63f4717 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); @@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) get_bh(bh); submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); + sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d8759401eca..9cc9bfd5176 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -332,33 +332,17 @@ static void ext4_put_nojournal(handle_t *handle) * journal_end calls result in the superblock being marked dirty, so * that sync() will call the filesystem's write_super callback if * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer. */ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) { journal_t *journal; - handle_t *handle; trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; - handle = ext4_journal_current_handle(); - - /* - * If a handle has been started, it should be allowed to - * finish, otherwise deadlock could happen between freeze - * and others(e.g. truncate) due to the restart of the - * journal handle if the filesystem is forzen and active - * handles are not stopped. - */ - if (!handle) - vfs_check_frozen(sb, SB_FREEZE_TRANS); - if (!journal) return ext4_get_nojournal(); /* @@ -2723,6 +2707,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; + sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2749,6 +2734,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } + sb_end_write(sb); return ret; } @@ -4302,10 +4288,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - vfs_check_frozen(sb, SB_FREEZE_TRANS); + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -4342,9 +4326,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean - * state independently, because ext4 prevents a new handle from being started - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from - * the upper layer. + * state independently. It relies on upper layer to stop all data & metadata + * modifications. */ static int ext4_freeze(struct super_block *sb) { @@ -4371,7 +4354,7 @@ static int ext4_freeze(struct super_block *sb) EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on s_frozen to stop further updates */ + /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return error; } -- cgit v1.2.3 From d9457dc056249913a7abe8b71dc09e427e590e35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:39 +0200 Subject: xfs: Convert to new freezing code Generic code now blocks all writers from standard write paths. So we add blocking of all writers coming from ioctl (we get a protection of ioctl against racing remount read-only as a bonus) and convert xfs_file_aio_write() to a non-racy freeze protection. We also keep freeze protection on transaction start to block internal filesystem writes such as removal of preallocated blocks. CC: Ben Myers CC: Alex Elder CC: xfs@oss.sgi.com Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/xfs/xfs_aops.c | 18 +++++++++++++++++ fs/xfs/xfs_file.c | 10 +++++++--- fs/xfs/xfs_ioctl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_ioctl32.c | 12 ++++++++++++ fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 3 --- fs/xfs/xfs_sync.c | 2 +- fs/xfs/xfs_trans.c | 17 +++++++++++++--- fs/xfs/xfs_trans.h | 2 ++ 10 files changed, 109 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8dad722c004..daa42383ebd 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -123,6 +123,12 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; + /* + * We will pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -199,6 +205,15 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (ioend->io_append_trans) { + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -1410,6 +1425,9 @@ out_trans_cancel: if (ioend->io_append_trans) { current_set_flags_nested(&ioend->io_append_trans->t_pflags, PF_FSTRANS); + rwsem_acquire_read( + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_trans_cancel(ioend->io_append_trans, 0); } out_destroy_ioend: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f7ec15a652..f0081f20e5c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -781,10 +781,12 @@ xfs_file_aio_write( if (ocount == 0) return 0; - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ret = -EIO; + goto out; + } if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -803,6 +805,8 @@ xfs_file_aio_write( ret = err; } +out: + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f1535d25a9..0e0232c3b6d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -364,9 +364,15 @@ xfs_fssetdm_by_handle( if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(parfilp); + if (error) + return error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); return PTR_ERR(dentry); + } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); @@ -382,6 +388,7 @@ xfs_fssetdm_by_handle( fsd.fsd_dmstate); out: + mnt_drop_write_file(parfilp); dput(dentry); return error; } @@ -634,7 +641,11 @@ xfs_ioc_space( if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + mnt_drop_write_file(filp); return -error; } @@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr( { struct fsxattr fa; unsigned int mask; + int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) mask |= FSX_NONBLOCK; - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1196,6 +1213,7 @@ xfs_ioc_setxflags( struct fsxattr fa; unsigned int flags; unsigned int mask; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1210,7 +1228,12 @@ xfs_ioc_setxflags( mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1385,8 +1408,13 @@ xfs_file_ioctl( if (copy_from_user(&dmi, arg, sizeof(dmi))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); + mnt_drop_write_file(filp); return -error; } @@ -1434,7 +1462,11 @@ xfs_file_ioctl( if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } @@ -1463,9 +1495,14 @@ xfs_file_ioctl( if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + /* input parameter is passed in resblks field of structure */ in = inout.resblks; error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); if (error) return -error; @@ -1496,7 +1533,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1506,7 +1547,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1516,7 +1561,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf..1244274a567 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_data_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_rt_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl( offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index aadfce6681e..b3b9b26091a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -680,9 +680,9 @@ xfs_iomap_write_unwritten( * the same inode that we complete here and might deadlock * on the iolock. */ - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE; + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536021fb3d4..b09a4a7eb64 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1544,7 +1544,7 @@ xfs_unmountfs( int xfs_fs_writable(xfs_mount_t *mp) { - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 90c1fc9eaea..c6bca0d92cb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -314,9 +314,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - /* * Flags for xfs_mountfs */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 1e9ee064dbb..0b9feacdcd1 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -394,7 +394,7 @@ xfs_sync_worker( if (!(mp->m_super->s_flags & MS_ACTIVE) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdf324508c5..06ed520a767 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -576,8 +576,12 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type, KM_SLEEP); + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; } xfs_trans_t * @@ -588,6 +592,7 @@ _xfs_trans_alloc( { xfs_trans_t *tp; + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); @@ -611,6 +616,8 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); } @@ -643,7 +650,11 @@ xfs_trans_dup( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7c37b533aa8..19c17428287 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -179,6 +179,8 @@ struct xfs_log_item_desc { #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ /* * Values for call flags parameter. -- cgit v1.2.3 From fef6925cd4c6b564ecff477e07a0fca987542223 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:40 +0200 Subject: ocfs2: Convert to new freezing mechanism Protect ocfs2_page_mkwrite() and ocfs2_file_aio_write() using the new freeze protection. We also protect several ioctl entry points which were missing the protection. Finally, we add freeze protection to the journaling mechanism so that iput() of unlinked inode cannot modify a frozen filesystem. CC: Mark Fasheh CC: Joel Becker CC: ocfs2-devel@oss.oracle.com Acked-by: Joel Becker Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ocfs2/file.c | 11 +++++++++-- fs/ocfs2/ioctl.c | 14 ++++++++++++-- fs/ocfs2/journal.c | 7 ++++++- fs/ocfs2/mmap.c | 2 ++ 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7602783d7f4..46a1f6d7510 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, { struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) @@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + ret = mnt_want_write_file(file); + if (ret) + return ret; + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + mnt_drop_write_file(file); + return ret; } static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, @@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2436,6 +2442,7 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); if (written) ret = written; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d96f7f81d8d..f20edcbfe70 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; - return ocfs2_group_extend(inode, new_clusters); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_extend(inode, new_clusters); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: if (!capable(CAP_SYS_RESOURCE)) @@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; - return ocfs2_group_add(inode, &input); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_add(inode, &input); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 0a42ae96dca..2dd36af79e2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) if (journal_current_handle()) return jbd2_journal_start(journal, max_buffs); + sb_start_intwrite(osb->sb); + down_read(&osb->journal->j_trans_barrier); handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); + sb_end_intwrite(osb->sb); mlog_errno(PTR_ERR(handle)); @@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, if (ret < 0) mlog_errno(ret); - if (!nested) + if (!nested) { up_read(&journal->j_trans_barrier); + sb_end_intwrite(osb->sb); + } return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9cd41083e99..d150372fd81 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sigset_t oldset; int ret; + sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); /* @@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); + sb_end_pagefault(inode->i_sb); return ret; } -- cgit v1.2.3 From 39263d5e71d0fad09eab0d855a9407ad2af8378c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:41 +0200 Subject: gfs2: Convert to new freezing mechanism We update gfs2_page_mkwrite() to use new freeze protection and the transaction code to use freeze protection while the transaction is running. That is needed to stop iput() of unlinked file from modifying the filesystem. The rest is handled by the generic code. CC: cluster-devel@redhat.com CC: Steven Whitehouse Acked-by: Steven Whitehouse Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/gfs2/file.c | 15 +++------------ fs/gfs2/trans.c | 4 ++++ 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 07959150d44..8ffeb031248 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -370,11 +370,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret; - /* Wait if fs is frozen. This is racy so we check again later on - * and retry if the fs has been frozen after the page lock has - * been acquired - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); /* Update file times before taking page lock */ file_update_time(vma->vm_file); @@ -458,14 +454,9 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - /* This check must be post dropping of transaction lock */ - if (inode->i_sb->s_frozen == SB_UNFROZEN) { - wait_on_page_writeback(page); - } else { - ret = -EAGAIN; - unlock_page(page); - } + wait_on_page_writeback(page); } + sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ad3e2fb763d..adbd27875ef 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); + sb_start_intwrite(sdp->sd_vfs); gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); error = gfs2_glock_nq(&tr->tr_t_gh); @@ -68,6 +69,7 @@ fail_gunlock: gfs2_glock_dq(&tr->tr_t_gh); fail_holder_uninit: + sb_end_intwrite(sdp->sd_vfs); gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); @@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); } + sb_end_intwrite(sdp->sd_vfs); return; } @@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); + sb_end_intwrite(sdp->sd_vfs); } /** -- cgit v1.2.3 From 58ef6a75c38e9faa7d19bb7d7b45fe0df02e8621 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:42 +0200 Subject: fuse: Convert to new freezing mechanism Convert check in fuse_file_aio_write() to using new freeze protection. CC: fuse-devel@lists.sourceforge.net CC: Miklos Szeredi Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/fuse/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b321a688cde..93d8d6c9494 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -944,9 +944,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -1004,6 +1003,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); return written ? written : err; } -- cgit v1.2.3 From fbf8fb76505a9e5bbb47e91d964105b6ea59b0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:43 +0200 Subject: ntfs: Convert to new freezing mechanism Move check in ntfs_file_aio_write_nolock() to ntfs_file_aio_write() and use new freeze protection. CC: linux-ntfs-dev@lists.sourceforge.net CC: Anton Altaparmakov Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ntfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7389d2d5e51..1ecf46448f8 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, if (err) return err; pos = *ppos; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; written = 0; @@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } + sb_end_write(inode->i_sb); return ret; } -- cgit v1.2.3 From 2c22b337b5bbb497c41b348b2357b7070ed5ba88 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:44 +0200 Subject: nilfs2: Convert to new freezing mechanism We change nilfs_page_mkwrite() to provide proper freeze protection for writeable page faults (we must wait for frozen filesystem even if the page is fully mapped). We remove all vfs_check_frozen() checks since they are now handled by the generic code. CC: linux-nilfs@vger.kernel.org CC: KONISHI Ryusuke Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/nilfs2/file.c | 18 +++++++++++------- fs/nilfs2/ioctl.c | 2 -- fs/nilfs2/segment.c | 5 ++++- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 62cebc8e1a1..a4d56ac02e6 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_dentry->d_inode; struct nilfs_transaction_info ti; - int ret; + int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ + sb_start_pagefault(inode->i_sb); lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); - return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + ret = -EFAULT; /* make the VM retry the fault */ + goto out; } /* @@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) - return VM_FAULT_SIGBUS; + goto out; - ret = block_page_mkwrite(vma, vmf, nilfs_get_block); - if (ret != VM_FAULT_LOCKED) { + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret) { nilfs_transaction_abort(inode->i_sb); - return ret; + goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_on_page_writeback(page); - return VM_FAULT_LOCKED; + out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 06658caa18b..08f27968a7a 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, goto out_free; } - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 88e11fb346b..a5752a58993 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb, if (ret > 0) return 0; - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_intwrite(sb); nilfs = sb->s_fs_info; down_read(&nilfs->ns_segctor_sem); @@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb, current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return ret; } @@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb) err = nilfs_construct_segment(sb); if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return err; } @@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb) current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); } void nilfs_relax_pressure_in_lock(struct super_block *sb) -- cgit v1.2.3 From b2b5ef5c8e89f19b68c174bf246f3ca212dbf0bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:45 +0200 Subject: btrfs: Convert to new freezing mechanism We convert btrfs_file_aio_write() to use new freeze check. We also add proper freeze protection to btrfs_page_mkwrite(). We also add freeze protection to the transaction mechanism to avoid starting transactions on frozen filesystem. At minimum this is necessary to stop iput() of unlinked file to change frozen filesystem during truncation. Checks in cleaner_kthread() and transaction_kthread() can be safely removed since btrfs_freeze() will lock the mutexes and thus block the threads (and they shouldn't have anything to do anyway). CC: linux-btrfs@vger.kernel.org CC: Chris Mason Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 6 +++++- fs/btrfs/transaction.c | 7 +++++++ 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2936ca49b3b..20c49b16b75 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1533,8 +1533,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); @@ -1566,7 +1564,6 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138..5caf285c6e4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t err = 0; size_t count, ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb8d671d00e..f4d901786b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6620,6 +6620,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + sb_start_pagefault(inode->i_sb); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); @@ -6709,12 +6710,15 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - if (!ret) + if (!ret) { + sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; + } unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out_noreserve: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b72b068183e..fa67ba51516 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -329,6 +329,8 @@ again: if (!h) return ERR_PTR(-ENOMEM); + sb_start_intwrite(root->fs_info->sb); + if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -339,6 +341,7 @@ again: } while (ret == -EBUSY); if (ret < 0) { + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -528,6 +531,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } + sb_end_intwrite(root->fs_info->sb); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1517,6 +1522,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + sb_end_intwrite(root->fs_info->sb); + trace_btrfs_transaction_commit(root); btrfs_scrub_continue(root); -- cgit v1.2.3 From 1e8b212fe5dcee9d3dbb152d235f3c33458fb26e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:46 +0200 Subject: ext2: Implement freezing The only missing piece to make freezing work reliably with ext2 is to stop iput() of unlinked inode from deleting the inode on frozen filesystem. So add a necessary protection to ext2_evict_inode(). We also provide appropriate ->freeze_fs and ->unfreeze_fs functions. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ext2/inode.c | 5 ++++- fs/ext2/super.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 264d315f6c4..6363ac66faf 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode) truncate_inode_pages(&inode->i_data, 0); if (want_delete) { + sb_start_intwrite(inode->i_sb); /* set dtime */ EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); @@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode) if (unlikely(rsv)) kfree(rsv); - if (want_delete) + if (want_delete) { ext2_free_inode(inode); + sb_end_intwrite(inode->i_sb); + } } typedef struct { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5df3d2d8169..15761e638da 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb, static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); +static int ext2_freeze(struct super_block *sb); +static int ext2_unfreeze(struct super_block *sb); void ext2_error(struct super_block *sb, const char *function, const char *fmt, ...) @@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = { .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, .sync_fs = ext2_sync_fs, + .freeze_fs = ext2_freeze, + .unfreeze_fs = ext2_unfreeze, .statfs = ext2_statfs, .remount_fs = ext2_remount, .show_options = ext2_show_options, @@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait) return 0; } +static int ext2_freeze(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + /* + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared + * because we have unattached inodes and thus filesystem is not fully + * consistent. + */ + if (atomic_long_read(&sb->s_remove_count)) { + ext2_sync_fs(sb, 1); + return 0; + } + /* Set EXT2_FS_VALID flag */ + spin_lock(&sbi->s_lock); + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, sbi->s_es, 1); + + return 0; +} + +static int ext2_unfreeze(struct super_block *sb) +{ + /* Just write sb to clear EXT2_VALID_FS flag */ + ext2_write_super(sb); + + return 0; +} void ext2_write_super(struct super_block *sb) { -- cgit v1.2.3 From d9c95bdd53a8d9116d269c91ce3d151472e6bcd6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:47 +0200 Subject: fs: Remove old freezing mechanism Now that all users are converted, we can remove functions, variables, and constants defined by the old freezing mechanism. BugLink: https://bugs.launchpad.net/bugs/897421 Tested-by: Kamal Mostafa Tested-by: Peter M. Petrakis Tested-by: Dann Frazier Tested-by: Massimo Morana Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 0f64ecb7b1b..a87dc1b1ac9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -217,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); - init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; -- cgit v1.2.3