From 8208a22bb8bd3c52ef634b4ff194f14892ab1713 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jul 2011 17:32:17 -0400 Subject: switch sys_mknodat(2) to umode_t Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 5008f01787f..f6b3c73e862 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2489,7 +2489,7 @@ static int may_mknod(mode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned, dev) { struct dentry *dentry; @@ -2536,7 +2536,7 @@ out_dput: return error; } -SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) +SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return sys_mknodat(AT_FDCWD, filename, mode, dev); } -- cgit v1.2.3 From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:41:39 -0400 Subject: switch vfs_mkdir() and ->mkdir() to umode_t vfs_mkdir() gets int, but immediately drops everything that might not fit into umode_t and that's the only caller of ->mkdir()... Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index f6b3c73e862..443c703249b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2541,7 +2541,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d return sys_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error = may_create(dir, dentry); -- cgit v1.2.3 From 4acdaf27ebe2034c342f3be57ef49aed1ad885ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:42:34 -0400 Subject: switch ->create() to umode_t vfs_create() ignores everything outside of 16bit subset of its mode argument; switching it to umode_t is obviously equivalent and it's the only caller of the method Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 443c703249b..05d1c2ceb13 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1976,7 +1976,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } } -int vfs_create(struct inode *dir, struct dentry *dentry, int mode, +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int error = may_create(dir, dentry); -- cgit v1.2.3 From 1a67aafb5f72a436ca044293309fa7e6351d6a35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:52:52 -0400 Subject: switch ->mknod() to umode_t Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 05d1c2ceb13..85bb44f222c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2444,7 +2444,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct pat } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int error = may_create(dir, dentry); -- cgit v1.2.3 From f69aac0006c303a98da9d2db04b71fd1c600d503 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 04:31:40 -0400 Subject: switch may_mknod() to umode_t Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 85bb44f222c..e275dc36d7c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2472,7 +2472,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } -static int may_mknod(mode_t mode) +static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: -- cgit v1.2.3 From a218d0fdc5f9004164ff151d274487f6799907d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Nov 2011 14:59:34 -0500 Subject: switch open and mkdir syscalls to umode_t Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e275dc36d7c..afd5876cd07 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2177,7 +2177,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Negative dentry, just create the file */ if (!dentry->d_inode) { - int mode = op->mode; + umode_t mode = op->mode; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); /* @@ -2562,7 +2562,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return error; } -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { struct dentry *dentry; struct path path; @@ -2590,7 +2590,7 @@ out_dput: return error; } -SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return sys_mkdirat(AT_FDCWD, pathname, mode); } -- cgit v1.2.3 From c71053659e3bb27d44b79da0bb4abf5838c2060a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 18:22:03 -0500 Subject: vfs: spread struct mount - __lookup_mnt() result switch __lookup_mnt() to returning struct mount *; callers adjusted. Signed-off-by: Al Viro --- fs/namei.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 5008f01787f..d1c6a559f8f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -36,6 +36,7 @@ #include #include "internal.h" +#include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) @@ -884,7 +885,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode) { for (;;) { - struct vfsmount *mounted; + struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. @@ -898,8 +899,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; - path->mnt = mounted; - path->dentry = mounted->mnt_root; + path->mnt = &mounted->mnt; + path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* @@ -915,12 +916,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static void follow_mount_rcu(struct nameidata *nd) { while (d_mountpoint(nd->path.dentry)) { - struct vfsmount *mounted; + struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); if (!mounted) break; - nd->path.mnt = mounted; - nd->path.dentry = mounted->mnt_root; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } } -- cgit v1.2.3 From 3376f34fff5be9954fd9a9c4fd68f4a0a36d480e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:05:19 -0500 Subject: vfs: mnt_parent moved to struct mount the second victim... Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index d1c6a559f8f..89248bf1b90 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -680,7 +680,7 @@ static int follow_up_rcu(struct path *path) struct vfsmount *parent; struct dentry *mountpoint; - parent = path->mnt->mnt_parent; + parent = real_mount(path->mnt)->mnt_parent; if (parent == path->mnt) return 0; mountpoint = path->mnt->mnt_mountpoint; @@ -695,7 +695,7 @@ int follow_up(struct path *path) struct dentry *mountpoint; br_read_lock(vfsmount_lock); - parent = path->mnt->mnt_parent; + parent = real_mount(path->mnt)->mnt_parent; if (parent == path->mnt) { br_read_unlock(vfsmount_lock); return 0; -- cgit v1.2.3 From 0714a533805a0f8ebfc6fdb6bda9f129b8c7c6d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:19:58 -0500 Subject: vfs: now it can be done - make mnt_parent point to struct mount Signed-off-by: Al Viro --- fs/namei.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 89248bf1b90..2e9110a37c0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -677,36 +677,38 @@ follow_link(struct path *link, struct nameidata *nd, void **p) static int follow_up_rcu(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; - parent = real_mount(path->mnt)->mnt_parent; - if (parent == path->mnt) + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) return 0; - mountpoint = path->mnt->mnt_mountpoint; + mountpoint = mnt->mnt.mnt_mountpoint; path->dentry = mountpoint; - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } int follow_up(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; br_read_lock(vfsmount_lock); - parent = real_mount(path->mnt)->mnt_parent; - if (parent == path->mnt) { + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) { br_read_unlock(vfsmount_lock); return 0; } - mntget(parent); - mountpoint = dget(path->mnt->mnt_mountpoint); + mntget(&parent->mnt); + mountpoint = dget(mnt->mnt.mnt_mountpoint); br_read_unlock(vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } -- cgit v1.2.3 From a73324da7af4052e1d1ddec6a5980f552420e58b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:25:07 -0500 Subject: vfs: move mnt_mountpoint to struct mount Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 2e9110a37c0..87363aab43f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -684,7 +684,7 @@ static int follow_up_rcu(struct path *path) parent = mnt->mnt_parent; if (&parent->mnt == path->mnt) return 0; - mountpoint = mnt->mnt.mnt_mountpoint; + mountpoint = mnt->mnt_mountpoint; path->dentry = mountpoint; path->mnt = &parent->mnt; return 1; @@ -703,7 +703,7 @@ int follow_up(struct path *path) return 0; } mntget(&parent->mnt); - mountpoint = dget(mnt->mnt.mnt_mountpoint); + mountpoint = dget(mnt->mnt_mountpoint); br_read_unlock(vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; -- cgit v1.2.3 From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: do not call audit_getname on error Just a code cleanup really. We don't need to make a function call just for it to return on error. This also makes the VFS function even easier to follow and removes a conditional on a hot path. Signed-off-by: Eric Paris --- fs/namei.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index c283a1ec008..208c6aa4a98 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - int retval = do_getname(filename, tmp); - - result = tmp; - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(tmp); - result = ERR_PTR(retval); - } + char *result = __getname(); + int retval; + + if (!result) + return ERR_PTR(-ENOMEM); + + retval = do_getname(filename, result); + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(retval); } } audit_getname(result); -- cgit v1.2.3 From e188dc02d3a9c911be56eca5aa114fe7e9822d53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 3 Feb 2012 14:25:18 +0100 Subject: vfs: fix d_inode_lookup() dentry ref leak d_inode_lookup() leaks a dentry reference on IS_DEADDIR(). Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 208c6aa4a98..a780ea515c4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1095,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) + if (unlikely(IS_DEADDIR(inode))) { + dput(dentry); return ERR_PTR(-ENOENT); + } old = inode->i_op->lookup(inode, dentry, nd); if (unlikely(old)) { -- cgit v1.2.3 From 630d9c47274aa89bfa77fe6556d7818bdcb12992 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 16 Nov 2011 23:57:37 -0500 Subject: fs: reduce the use of module.h wherever possible For files only using THIS_MODULE and/or EXPORT_SYMBOL, map them onto including export.h -- or if the file isn't even using those, then just delete the include. Fix up any implicit include dependencies that were being masked by module.h along the way. Signed-off-by: Paul Gortmaker --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a780ea515c4..fa549f27f01 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include -- cgit v1.2.3 From 0145acc202ca613b23b5383e55df3c32a92ad1bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:32:59 -0800 Subject: vfs: uninline full_name_hash() .. and also use it in lookup_one_len() rather than open-coding it. There aren't any performance-critical users, so inlining it is silly. But it wouldn't matter if it wasn't for the fact that the word-at-a-time dentry name patches want to conditionally replace the function, and uninlining it sets the stage for that. So again, this is a preparatory patch that doesn't change any semantics, and only prepares for a much cleaner and testable word-at-a-time dentry name accessor patch. Signed-off-by: Linus Torvalds --- fs/namei.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a780ea515c4..ec72fa1acb1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,14 @@ static inline int can_lookup(struct inode *inode) return 1; } +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(*name++, hash); + return end_name_hash(hash); +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1775,24 +1783,21 @@ static struct dentry *lookup_hash(struct nameidata *nd) struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; - unsigned long hash; unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); this.name = name; this.len = len; + this.hash = full_name_hash(name, len); if (!len) return ERR_PTR(-EACCES); - hash = init_name_hash(); while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return ERR_PTR(-EACCES); - hash = partial_name_hash(c, hash); } - this.hash = end_name_hash(hash); /* * See if the low-level filesystem might want * to use its own hash.. -- cgit v1.2.3 From 200e9ef7ab51f3dce4f35f90ea458cf43ea83bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:49:24 -0800 Subject: vfs: split up name hashing in link_path_walk() into helper function The code in link_path_walk() that finds out the length and the hash of the next path component is some of the hottest code in the kernel. And I have a version of it that does things at the full width of the CPU wordsize at a time, but that means that we *really* want to split it up into a separate helper function. So this re-organizes the code a bit and splits the hashing part into a helper function called "hash_name()". It returns the length of the pathname component, while at the same time computing and writing the hash to the appropriate location. The code generation is slightly changed by this patch, but generally for the better - and the added abstraction actually makes the code easier to read too. And the new interface is well suited for replacing just the "hash_name()" function with alternative implementations. Signed-off-by: Linus Torvalds --- fs/namei.c | 52 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 18 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index ec72fa1acb1..71807dc7e40 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1382,6 +1382,25 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) return end_name_hash(hash); } +/* + * We know there's a real path component here of at least + * one character. + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c && c != '/'); + *hashp = end_name_hash(hash); + return len; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1402,31 +1421,22 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - unsigned long hash; struct qstr this; - unsigned int c; + long len; int type; err = may_lookup(nd); if (err) break; + len = hash_name(name, &this.hash); this.name = name; - c = *(const unsigned char *)name; - - hash = init_name_hash(); - do { - name++; - hash = partial_name_hash(c, hash); - c = *(const unsigned char *)name; - } while (c && (c != '/')); - this.len = name - (const char *) this.name; - this.hash = end_name_hash(hash); + this.len = len; type = LAST_NORM; - if (this.name[0] == '.') switch (this.len) { + if (name[0] == '.') switch (len) { case 2: - if (this.name[1] == '.') { + if (name[1] == '.') { type = LAST_DOTDOT; nd->flags |= LOOKUP_JUMPED; } @@ -1445,12 +1455,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } - /* remove trailing slashes? */ - if (!c) + if (!name[len]) goto last_component; - while (*++name == '/'); - if (!*name) + /* + * If it wasn't NUL, we know it was '/'. Skip that + * slash, and continue until no more slashes. + */ + do { + len++; + } while (unlikely(name[len] == '/')); + if (!name[len]) goto last_component; + name += len; err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) -- cgit v1.2.3 From ae942ae71934fddd0639160c24f6efa703d5784e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 19:40:57 -0800 Subject: vfs: export full_name_hash() function to modules Commit 5707c87f "vfs: uninline full_name_hash()" broke the modular build, because it needs exporting now that it isn't inlined any more. Reported-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 71807dc7e40..e2ba62820a0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1381,6 +1381,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) hash = partial_name_hash(*name++, hash); return end_name_hash(hash); } +EXPORT_SYMBOL(full_name_hash); /* * We know there's a real path component here of at least -- cgit v1.2.3 From bfcfaa77bdf0f775263e906015982a608df01c76 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 11:16:17 -0800 Subject: vfs: use 'unsigned long' accesses for dcache name comparison and hashing Ok, this is hacky, and only works on little-endian machines with goo unaligned handling. And even then only with CONFIG_DEBUG_PAGEALLOC disabled, since it can access up to 7 bytes after the pathname. But it runs like a bat out of hell. Signed-off-by: Linus Torvalds --- fs/namei.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e2ba62820a0..378497a744b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode *inode) return 1; } +/* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + * do a "get_unaligned()" if this helps and is sufficiently + * fast. + * + * - Little-endian machines (so that we can generate the mask + * of low bytes efficiently). Again, we *could* do a byte + * swapping load on big-endian architectures if that is not + * expensive enough to make the optimization worthless. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + * do not trap on the (extremely unlikely) case of a page + * crossing operation. + * + * - Furthermore, we need an efficient 64-bit compile for the + * 64-bit case in order to generate the "number of bytes in + * the final mask". Again, that could be replaced with a + * efficient population count instruction or similar. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. + */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608 >> 56; +} + +static inline unsigned int fold_hash(unsigned long hash) +{ + hash += hash >> (8*sizeof(int)); + return hash; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long a, mask; + unsigned long hash = 0; + + for (;;) { + a = *(unsigned long *)name; + hash *= 9; + if (len < sizeof(unsigned long)) + break; + hash += a; + name += sizeof(unsigned long); + len -= sizeof(unsigned long); + if (!len) + goto done; + } + mask = ~(~0ul << len*8); + hash += mask & a; +done: + return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +#define ONEBYTES 0x0101010101010101ul +#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful +#define HIGHBITS 0x8080808080808080ul + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - ONEBYTES) & ~a) & HIGHBITS; +} + +/* + * Calculate the length and hash of the path component, and + * return the length of the component; + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long a, mask, hash, len; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = (hash + a) * 9; + len += sizeof(unsigned long); + a = *(unsigned long *)(name+len); + /* Do we have any NUL or '/' bytes in this word? */ + mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + } while (!mask); + + /* The mask *below* the first high bit set */ + mask = (mask - 1) & ~mask; + mask >>= 7; + hash += a & mask; + *hashp = fold_hash(hash); + + return len + count_masked_bytes(mask); +} + +#else + unsigned int full_name_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); @@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) return len; } +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into -- cgit v1.2.3 From 097b180ca09b581ef0dc24fbcfc1b227de3875df Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 6 Mar 2012 13:56:33 +0100 Subject: vfs: fix double put after complete_walk() complete_walk() already puts nd->path, no need to do it again at cleanup time. This would result in Oopses if triggered, apparently the codepath is not too well exercised. Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e2ba62820a0..f79aef16320 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2261,7 +2261,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) - goto exit; + return ERR_PTR(error); error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; -- cgit v1.2.3 From 7f6c7e62fcc123e6bd9206da99a2163fe3facc31 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 6 Mar 2012 13:56:34 +0100 Subject: vfs: fix return value from do_last() complete_walk() returns either ECHILD or ESTALE. do_last() turns this into ECHILD unconditionally. If not in RCU mode, this error will reach userspace which is complete nonsense. Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index f79aef16320..46ea9cc1664 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2162,7 +2162,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* sayonara */ error = complete_walk(nd); if (error) - return ERR_PTR(-ECHILD); + return ERR_PTR(error); error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { -- cgit v1.2.3 From e8e3c3d66fd9d1ee2250f68d778cc48c1346d228 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:27 +0800 Subject: fs: remove the second argument of k[un]map_atomic() Acked-by: Benjamin LaHaise Signed-off-by: Cong Wang --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 46ea9cc1664..d135da74ce0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3371,9 +3371,9 @@ retry: if (err) goto fail; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memcpy(kaddr, symname, len-1); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); -- cgit v1.2.3 From 8de52778798fe39660a8d6b26f290e0c93202761 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Feb 2012 12:45:27 -0500 Subject: vfs: check i_nlink limits in vfs_{mkdir,rename_dir,link} New field of struct super_block - ->s_max_links. Maximal allowed value of ->i_nlink or 0; in the latter case all checks still need to be done in ->link/->mkdir/->rename instances. Note that this limit applies both to directoris and to non-directories. Signed-off-by: Al Viro --- fs/namei.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 46ea9cc1664..a0b82762e8f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2569,6 +2569,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error = may_create(dir, dentry); + unsigned max_links = dir->i_sb->s_max_links; if (error) return error; @@ -2581,6 +2582,9 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (error) return error; + if (max_links && dir->i_nlink >= max_links) + return -EMLINK; + error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); @@ -2911,6 +2915,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { struct inode *inode = old_dentry->d_inode; + unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) @@ -2941,6 +2946,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0) error = -ENOENT; + else if (max_links && inode->i_nlink >= max_links) + error = -EMLINK; else error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); @@ -3050,6 +3057,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, { int error = 0; struct inode *target = new_dentry->d_inode; + unsigned max_links = new_dir->i_sb->s_max_links; /* * If we are going to change the parent - check write permissions, @@ -3073,6 +3081,11 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; + error = -EMLINK; + if (max_links && !target && new_dir != old_dir && + new_dir->i_nlink >= max_links) + goto out; + if (target) shrink_dcache_parent(new_dentry); error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -- cgit v1.2.3 From 68ac1234fb949b66941d94dce4157742799fc581 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Mar 2012 08:21:57 -0400 Subject: switch touch_atime to struct path Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a0b82762e8f..0ccc74ee92f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -642,7 +642,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p) cond_resched(); current->total_link_count++; - touch_atime(link->mnt, dentry); + touch_atime(link); nd_set_link(nd, NULL); error = security_inode_follow_link(link->dentry, nd); -- cgit v1.2.3 From 1de5b41cd3b2474c2770b825266d372073e1b28b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Mar 2012 16:33:42 -0700 Subject: fs/namei.c: fix warnings on 32-bit i386 allnoconfig: fs/namei.c: In function 'has_zero': fs/namei.c:1617: warning: integer constant is too large for 'unsigned long' type fs/namei.c:1617: warning: integer constant is too large for 'unsigned long' type fs/namei.c: In function 'hash_name': fs/namei.c:1635: warning: integer constant is too large for 'unsigned long' type There must be a tidier way of doing this. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 20a4fcf001e..561db47ae04 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1455,9 +1455,15 @@ done: } EXPORT_SYMBOL(full_name_hash); +#ifdef CONFIG_64BIT #define ONEBYTES 0x0101010101010101ul #define SLASHBYTES 0x2f2f2f2f2f2f2f2ful #define HIGHBITS 0x8080808080808080ul +#else +#define ONEBYTES 0x01010101ul +#define SLASHBYTES 0x2f2f2f2ful +#define HIGHBITS 0x80808080ul +#endif /* Return the high bit set in the first byte that is a zero */ static inline unsigned long has_zero(unsigned long a) -- cgit v1.2.3 From f132c5be05e407a99cf582347a2ae0120acd3ad7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 22 Mar 2012 21:59:52 +0000 Subject: Fix full_name_hash() behaviour when length is a multiple of 8 We want it to match what hash_name() is doing, which means extra multiply by 9 in this case... Reported-and-Tested-by: Konrad Rzeszutek Wilk Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a94a7f9a03e..bd313d680d3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1439,10 +1439,10 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) for (;;) { a = *(unsigned long *)name; - hash *= 9; if (len < sizeof(unsigned long)) break; hash += a; + hash *= 9; name += sizeof(unsigned long); len -= sizeof(unsigned long); if (!len) -- cgit v1.2.3 From 989412bbd2835f1475d1528846693eddbac744c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 22 Mar 2012 15:58:27 -0700 Subject: vfs: tidy up fs/namei.c byte-repeat word constants In commit commit 1de5b41cd3b2 ("fs/namei.c: fix warnings on 32-bit") Andrew said that there must be a tidier way of doing this. This is that tidier way. Signed-off-by: Linus Torvalds --- fs/namei.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index bd313d680d3..99a34717b2b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1455,15 +1455,10 @@ done: } EXPORT_SYMBOL(full_name_hash); -#ifdef CONFIG_64BIT -#define ONEBYTES 0x0101010101010101ul -#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful -#define HIGHBITS 0x8080808080808080ul -#else -#define ONEBYTES 0x01010101ul -#define SLASHBYTES 0x2f2f2f2ful -#define HIGHBITS 0x80808080ul -#endif +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) +#define ONEBYTES REPEAT_BYTE(0x01) +#define SLASHBYTES REPEAT_BYTE('/') +#define HIGHBITS REPEAT_BYTE(0x80) /* Return the high bit set in the first byte that is a zero */ static inline unsigned long has_zero(unsigned long a) -- cgit v1.2.3 From f7493e5d9cc10ac97cf1f1579fdc14117460b40b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 22 Mar 2012 16:10:40 -0700 Subject: vfs: tidy up sparse warnings in fs/namei.c While doing the fs/namei.c cleanups, I ran sparse on it, and it pointed out other large integers and a couple of cases of us using '0' instead of the proper 'NULL'. Sparse still doesn't understand some of the conditional locking going on, but that's no excuse for not fixing up the trivial stuff. Signed-off-by: Linus Torvalds --- fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 99a34717b2b..73ec863a989 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -161,7 +161,7 @@ static char *getname_flags(const char __user *filename, int flags, int *empty) char *getname(const char __user * filename) { - return getname_flags(filename, 0, 0); + return getname_flags(filename, 0, NULL); } #ifdef CONFIG_AUDITSYSCALL @@ -1408,7 +1408,7 @@ static inline int can_lookup(struct inode *inode) */ static inline long count_masked_bytes(unsigned long mask) { - return mask*0x0001020304050608 >> 56; + return mask*0x0001020304050608ul >> 56; } static inline unsigned int fold_hash(unsigned long hash) @@ -1972,7 +1972,7 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { - return user_path_at_empty(dfd, name, flags, path, 0); + return user_path_at_empty(dfd, name, flags, path, NULL); } static int user_path_parent(int dfd, const char __user *path, -- cgit v1.2.3 From fa4ee159512ee39b6c65ac40db986ea7a2f7de60 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 26 Mar 2012 12:54:19 +0200 Subject: vfs: fix d_need_lookup/d_revalidate order in do_lookup Doing revalidate on a dentry which has not yet been looked up makes no sense. Move the d_need_lookup() check before d_revalidate(). Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index e615ff37e27..768f2366bdd 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1139,6 +1139,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; + if (unlikely(d_need_lookup(dentry))) + goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { @@ -1147,8 +1149,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, goto unlazy; } } - if (unlikely(d_need_lookup(dentry))) - goto unlazy; path->mnt = mnt; path->dentry = dentry; if (unlikely(!__follow_mount_rcu(nd, path, inode))) -- cgit v1.2.3 From 3637c05d881b2b7bab36f339245b8963f5b29c9f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 26 Mar 2012 12:54:20 +0200 Subject: vfs: don't revalidate just looked up dentry __lookup_hash() calls ->lookup() if the dentry needs lookup and on success revalidates the dentry (all under dir->i_mutex). While this is harmless it doesn't make a lot of sense. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 768f2366bdd..82f9568d315 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1869,9 +1869,7 @@ static struct dentry *__lookup_hash(struct qstr *name, * __lookup_hash is called with the parent dir's i_mutex already * held, so we are good to go here. */ - dentry = d_inode_lookup(base, dentry, nd); - if (IS_ERR(dentry)) - return dentry; + return d_inode_lookup(base, dentry, nd); } if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { -- cgit v1.2.3 From cda309de253f338b04d15b4478e45fc3a0fcc7a3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 26 Mar 2012 12:54:21 +0200 Subject: vfs: move MAY_EXEC check from __lookup_hash() The only caller of __lookup_hash() that needs the exec permission check on parent is lookup_one_len(). All lookup_hash() callers already checked permission in LOOKUP_PARENT walk. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 82f9568d315..907e2478557 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1849,13 +1849,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { - struct inode *inode = base->d_inode; struct dentry *dentry; - int err; - - err = inode_permission(inode, MAY_EXEC); - if (err) - return ERR_PTR(err); /* * Don't bother with __d_lookup: callers are for creat as @@ -1922,6 +1916,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; unsigned int c; + int err; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); @@ -1946,6 +1941,10 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return ERR_PTR(err); } + err = inode_permission(base->d_inode, MAY_EXEC); + if (err) + return ERR_PTR(err); + return __lookup_hash(&this, base, NULL); } -- cgit v1.2.3 From 3f6c7c71a2af6eb306e16562c6ee1bfdb48015fb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:04:16 -0400 Subject: untangling do_lookup() - isolate !dentry stuff from the rest of it. Duplicate the revalidation-related parts into if (!dentry) branch. Next step will be to pull them under i_mutex. This and the next 8 commits are more or less a splitup of patch by Miklos; folks, when you are working with something that convoluted, carve your patches up into easily reviewed steps, especially when a lot of codepaths involved are rarely hit... Signed-off-by: Al Viro --- fs/namei.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 907e2478557..157f3debbf9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1194,6 +1194,21 @@ retry: status = 1; } mutex_unlock(&dir->i_mutex); + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + goto done; } if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd); @@ -1209,7 +1224,7 @@ retry: goto retry; } } - +done: path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); -- cgit v1.2.3 From 37c17e1f377696c797e75c1e915e838b3e0c6120 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:08:28 -0400 Subject: untangling do_lookup() - expand the area under ->i_mutex keep holding ->i_mutex over revalidation parts Signed-off-by: Al Viro --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 157f3debbf9..48fc0fb8c9d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1173,6 +1173,7 @@ retry: BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); +l: dentry = d_lookup(parent, name); if (likely(!dentry)) { dentry = d_alloc_and_lookup(parent, name, nd); @@ -1193,11 +1194,11 @@ retry: need_reval = 0; status = 1; } - mutex_unlock(&dir->i_mutex); if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { if (status < 0) { + mutex_unlock(&dir->i_mutex); dput(dentry); return status; } @@ -1205,9 +1206,10 @@ retry: dput(dentry); dentry = NULL; need_reval = 1; - goto retry; + goto l; } } + mutex_unlock(&dir->i_mutex); goto done; } if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) -- cgit v1.2.3 From acc9cb3cd425f479d8fc4a441bff45dce23aa6dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:13:15 -0400 Subject: untangling do_lookup() - eliminate a loop. d_lookup() *will* fail after successful d_invalidate(), if we are holding i_mutex all along. IOW, we don't need to jump back to l: - we know what path will be taken there and can do that (i.e. d_alloc_and_lookup()) directly. Signed-off-by: Al Viro --- fs/namei.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 48fc0fb8c9d..9ce43a358c3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1173,7 +1173,6 @@ retry: BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); -l: dentry = d_lookup(parent, name); if (likely(!dentry)) { dentry = d_alloc_and_lookup(parent, name, nd); @@ -1204,9 +1203,14 @@ l: } if (!d_invalidate(dentry)) { dput(dentry); - dentry = NULL; - need_reval = 1; - goto l; + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; } } mutex_unlock(&dir->i_mutex); -- cgit v1.2.3 From 08b0ab7c20f767187ae635d51bdd9d262ebe8357 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:18:50 -0400 Subject: untangling do_lookup() - get rid of need_reval in !dentry case Everything arriving into if (!dentry) will have need_reval = 1. Indeed, the only way to get there with need_reval reset to 0 would be via if (unlikely(d_need_lookup(dentry))) goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { if (status != -ECHILD) need_reval = 0; goto unlazy; ... unlazy: /* no assignments to dentry */ if (dentry && unlikely(d_need_lookup(dentry))) { dput(dentry); dentry = NULL; } and if d_need_lookup() had already been false the first time around, it will remain false on the second call as well. Signed-off-by: Al Viro --- fs/namei.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 9ce43a358c3..14bb00a9fa9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1181,7 +1181,6 @@ retry: return PTR_ERR(dentry); } /* known good */ - need_reval = 0; status = 1; } else if (unlikely(d_need_lookup(dentry))) { dentry = d_inode_lookup(parent, dentry, nd); @@ -1190,10 +1189,8 @@ retry: return PTR_ERR(dentry); } /* known good */ - need_reval = 0; status = 1; - } - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + } else if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { if (status < 0) { @@ -1209,7 +1206,6 @@ retry: return PTR_ERR(dentry); } /* known good */ - need_reval = 0; status = 1; } } @@ -1226,7 +1222,6 @@ retry: if (!d_invalidate(dentry)) { dput(dentry); dentry = NULL; - need_reval = 1; goto retry; } } -- cgit v1.2.3 From d774a058d94d6b0dafada2295ec5221481b07d16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:34:00 -0400 Subject: untangling do_lookup() - massage !dentry case towards __lookup_hash() Reorder if-else cases for starters... Signed-off-by: Al Viro --- fs/namei.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 14bb00a9fa9..5414438abff 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1174,39 +1174,34 @@ retry: mutex_lock(&dir->i_mutex); dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } - /* known good */ - status = 1; - } else if (unlikely(d_need_lookup(dentry))) { + if (dentry && d_need_lookup(dentry)) { dentry = d_inode_lookup(parent, dentry, nd); if (IS_ERR(dentry)) { mutex_unlock(&dir->i_mutex); return PTR_ERR(dentry); } - /* known good */ - status = 1; - } else if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + } else if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - if (status < 0) { - mutex_unlock(&dir->i_mutex); - dput(dentry); - return status; - } - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = d_alloc_and_lookup(parent, name, nd); - if (IS_ERR(dentry)) { + if (unlikely(status <= 0)) { + if (status < 0) { mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); + dput(dentry); + return status; } - /* known good */ - status = 1; + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + } + } + } else if (!dentry) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); } } mutex_unlock(&dir->i_mutex); -- cgit v1.2.3 From ec335e91a4f088d8759c1311d0724e609d1c318e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:37:42 -0400 Subject: untangling do_lookup() - merge failure exits in !dentry case Signed-off-by: Al Viro --- fs/namei.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 5414438abff..a0f9a0294ff 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1176,35 +1176,28 @@ retry: dentry = d_lookup(parent, name); if (dentry && d_need_lookup(dentry)) { dentry = d_inode_lookup(parent, dentry, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } - } else if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { + goto l; + } + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { if (status < 0) { - mutex_unlock(&dir->i_mutex); dput(dentry); - return status; + dentry = ERR_PTR(status); + goto l; } if (!d_invalidate(dentry)) { dput(dentry); dentry = d_alloc_and_lookup(parent, name, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } } } } else if (!dentry) { dentry = d_alloc_and_lookup(parent, name, nd); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->i_mutex); - return PTR_ERR(dentry); - } } + l: mutex_unlock(&dir->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); goto done; } if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) -- cgit v1.2.3 From a6ecdfcfba9392f469992dd6016ceafb3ea62123 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:39:15 -0400 Subject: untangling do_lookup() - merge d_alloc_and_lookup() callers Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a0f9a0294ff..1d60fdf01b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1188,12 +1188,12 @@ retry: } if (!d_invalidate(dentry)) { dput(dentry); - dentry = d_alloc_and_lookup(parent, name, nd); + dentry = NULL; } } - } else if (!dentry) { - dentry = d_alloc_and_lookup(parent, name, nd); } + if (!dentry) + dentry = d_alloc_and_lookup(parent, name, nd); l: mutex_unlock(&dir->i_mutex); if (IS_ERR(dentry)) -- cgit v1.2.3 From a32555466caee38faeef4e44d7878ecbff1199bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:41:51 -0400 Subject: untangling do_lookup() - switch to calling __lookup_hash() now we have __lookup_hash() open-coded if !dentry case; just call the damn thing instead... Signed-off-by: Al Viro --- fs/namei.c | 113 +++++++++++++++++++++++++------------------------------------ 1 file changed, 46 insertions(+), 67 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 1d60fdf01b3..a919affd153 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1108,6 +1108,51 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr return dentry; } +static struct dentry *__lookup_hash(struct qstr *name, + struct dentry *base, struct nameidata *nd) +{ + struct dentry *dentry; + + /* + * Don't bother with __d_lookup: callers are for creat as + * well as unlink, so a lot of the time it would cost + * a double lookup. + */ + dentry = d_lookup(base, name); + + if (dentry && d_need_lookup(dentry)) { + /* + * __lookup_hash is called with the parent dir's i_mutex already + * held, so we are good to go here. + */ + return d_inode_lookup(base, dentry, nd); + } + + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { + int status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + /* + * The dentry failed validation. + * If d_revalidate returned 0 attempt to invalidate + * the dentry otherwise d_revalidate is asking us + * to return a fail status. + */ + if (status < 0) { + dput(dentry); + return ERR_PTR(status); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + } + + if (!dentry) + dentry = d_alloc_and_lookup(base, name, nd); + + return dentry; +} + /* * It's more convoluted than I'd like it to be, but... it's still fairly * small and for now I'd prefer to have fast path as straight as possible. @@ -1173,28 +1218,7 @@ retry: BUG_ON(nd->inode != dir); mutex_lock(&dir->i_mutex); - dentry = d_lookup(parent, name); - if (dentry && d_need_lookup(dentry)) { - dentry = d_inode_lookup(parent, dentry, nd); - goto l; - } - if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { - status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - if (status < 0) { - dput(dentry); - dentry = ERR_PTR(status); - goto l; - } - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } - } - if (!dentry) - dentry = d_alloc_and_lookup(parent, name, nd); - l: + dentry = __lookup_hash(name, parent, nd); mutex_unlock(&dir->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1850,51 +1874,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, return err; } -static struct dentry *__lookup_hash(struct qstr *name, - struct dentry *base, struct nameidata *nd) -{ - struct dentry *dentry; - - /* - * Don't bother with __d_lookup: callers are for creat as - * well as unlink, so a lot of the time it would cost - * a double lookup. - */ - dentry = d_lookup(base, name); - - if (dentry && d_need_lookup(dentry)) { - /* - * __lookup_hash is called with the parent dir's i_mutex already - * held, so we are good to go here. - */ - return d_inode_lookup(base, dentry, nd); - } - - if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { - int status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - /* - * The dentry failed validation. - * If d_revalidate returned 0 attempt to invalidate - * the dentry otherwise d_revalidate is asking us - * to return a fail status. - */ - if (status < 0) { - dput(dentry); - return ERR_PTR(status); - } else if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } - } - - if (!dentry) - dentry = d_alloc_and_lookup(base, name, nd); - - return dentry; -} - /* * Restricted form of lookup. Doesn't follow links, single-component only, * needs parent already locked. Doesn't follow mounts. -- cgit v1.2.3 From 81e6f520898edbda56e8680d338ace4f5694874e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Mar 2012 14:48:04 -0400 Subject: untangling do_lookup() - take __lookup_hash()-calling case out of line. Signed-off-by: Al Viro --- fs/namei.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a919affd153..5eeec562a03 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1208,22 +1208,14 @@ unlazy: dentry = __d_lookup(parent, name); } - if (dentry && unlikely(d_need_lookup(dentry))) { + if (unlikely(!dentry)) + goto need_lookup; + + if (unlikely(d_need_lookup(dentry))) { dput(dentry); - dentry = NULL; + goto need_lookup; } -retry: - if (unlikely(!dentry)) { - struct inode *dir = parent->d_inode; - BUG_ON(nd->inode != dir); - mutex_lock(&dir->i_mutex); - dentry = __lookup_hash(name, parent, nd); - mutex_unlock(&dir->i_mutex); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - goto done; - } if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { @@ -1233,8 +1225,7 @@ retry: } if (!d_invalidate(dentry)) { dput(dentry); - dentry = NULL; - goto retry; + goto need_lookup; } } done: @@ -1249,6 +1240,16 @@ done: nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; + +need_lookup: + BUG_ON(nd->inode != parent->d_inode); + + mutex_lock(&parent->d_inode->i_mutex); + dentry = __lookup_hash(name, parent, nd); + mutex_unlock(&parent->d_inode->i_mutex); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + goto done; } static inline int may_lookup(struct nameidata *nd) -- cgit v1.2.3 From bad61189780ec0592cacde01a0775cb98a30efdc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 26 Mar 2012 12:54:24 +0200 Subject: vfs: split __lookup_hash Split __lookup_hash into two component functions: lookup_dcache - tries cached lookup, returns whether real lookup is needed lookup_real - calls i_op->lookup This eliminates code duplication between d_alloc_and_lookup() and d_inode_lookup(). Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 108 +++++++++++++++++++++++++------------------------------------ 1 file changed, 44 insertions(+), 64 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 5eeec562a03..fef80bfdc7a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1054,53 +1054,65 @@ static void follow_dotdot(struct nameidata *nd) } /* - * Allocate a dentry with name and parent, and perform a parent - * directory ->lookup on it. Returns the new dentry, or ERR_PTR - * on error. parent->d_inode->i_mutex must be held. d_lookup must - * have verified that no child exists while under i_mutex. + * This looks up the name in dcache, possibly revalidates the old dentry and + * allocates a new one if not found or not valid. In the need_lookup argument + * returns whether i_op->lookup is necessary. + * + * dir->d_inode->i_mutex must be held */ -static struct dentry *d_alloc_and_lookup(struct dentry *parent, - struct qstr *name, struct nameidata *nd) +static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, + struct nameidata *nd, bool *need_lookup) { - struct inode *inode = parent->d_inode; struct dentry *dentry; - struct dentry *old; + int error; - /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) - return ERR_PTR(-ENOENT); + *need_lookup = false; + dentry = d_lookup(dir, name); + if (dentry) { + if (d_need_lookup(dentry)) { + *need_lookup = true; + } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { + error = d_revalidate(dentry, nd); + if (unlikely(error <= 0)) { + if (error < 0) { + dput(dentry); + return ERR_PTR(error); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + } + } - dentry = d_alloc(parent, name); - if (unlikely(!dentry)) - return ERR_PTR(-ENOMEM); + if (!dentry) { + dentry = d_alloc(dir, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); - old = inode->i_op->lookup(inode, dentry, nd); - if (unlikely(old)) { - dput(dentry); - dentry = old; + *need_lookup = true; } return dentry; } /* - * We already have a dentry, but require a lookup to be performed on the parent - * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error. - * parent->d_inode->i_mutex must be held. d_lookup must have verified that no - * child exists while under i_mutex. + * Call i_op->lookup on the dentry. The dentry must be negative but may be + * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * + * dir->d_inode->i_mutex must be held */ -static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry, - struct nameidata *nd) +static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) { - struct inode *inode = parent->d_inode; struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) { + if (unlikely(IS_DEADDIR(dir))) { dput(dentry); return ERR_PTR(-ENOENT); } - old = inode->i_op->lookup(inode, dentry, nd); + old = dir->i_op->lookup(dir, dentry, nd); if (unlikely(old)) { dput(dentry); dentry = old; @@ -1111,46 +1123,14 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { + bool need_lookup; struct dentry *dentry; - /* - * Don't bother with __d_lookup: callers are for creat as - * well as unlink, so a lot of the time it would cost - * a double lookup. - */ - dentry = d_lookup(base, name); + dentry = lookup_dcache(name, base, nd, &need_lookup); + if (!need_lookup) + return dentry; - if (dentry && d_need_lookup(dentry)) { - /* - * __lookup_hash is called with the parent dir's i_mutex already - * held, so we are good to go here. - */ - return d_inode_lookup(base, dentry, nd); - } - - if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { - int status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - /* - * The dentry failed validation. - * If d_revalidate returned 0 attempt to invalidate - * the dentry otherwise d_revalidate is asking us - * to return a fail status. - */ - if (status < 0) { - dput(dentry); - return ERR_PTR(status); - } else if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } - } - - if (!dentry) - dentry = d_alloc_and_lookup(base, name, nd); - - return dentry; + return lookup_real(base->d_inode, dentry, nd); } /* -- cgit v1.2.3 From c0d0259481cc6ec2a38cad810055e455de35c733 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 15 Feb 2012 11:48:40 -0500 Subject: vfs: fix out-of-date dentry_unhash() comment 64252c75a2196a0cf1e0d3777143ecfe0e3ae650 "vfs: remove dget() from dentry_unhash()" changed the implementation but not the comment. Cc: Sage Weil Signed-off-by: J. Bruce Fields Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index fef80bfdc7a..1898198abc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2710,7 +2710,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /* * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 2 if we're the only user of this + * should have a usage count of 1 if we're the only user of this * dentry, and if that is true (possibly after pruning the dcache), * then we drop the dentry now. * -- cgit v1.2.3 From 975d6b3932d43b87a48d2107264ed0c9a7541d8d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Nov 2011 12:16:43 -0800 Subject: vfs: Don't allow a user namespace root to make device nodes Safely making device nodes in a container is solvable but simply having the capability in a user namespace is not sufficient to make this work. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 1898198abc3..701954d68ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2560,8 +2560,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(inode_userns(dir), CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) -- cgit v1.2.3 From f68e556e23d1a4176b563bcb25d8baf2c5313f91 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 Apr 2012 13:54:56 -0700 Subject: Make the "word-at-a-time" helper functions more commonly usable I have a new optimized x86 "strncpy_from_user()" that will use these same helper functions for all the same reasons the name lookup code uses them. This is preparation for that. This moves them into an architecture-specific header file. It's architecture-specific for two reasons: - some of the functions are likely to want architecture-specific implementations. Even if the current code happens to be "generic" in the sense that it should work on any little-endian machine, it's likely that the "multiply by a big constant and shift" implementation is less than optimal for an architecture that has a guaranteed fast bit count instruction, for example. - I expect that if architectures like sparc want to start playing around with this, we'll need to abstract out a few more details (in particular the actual unaligned accesses). So we're likely to have more architecture-specific stuff if non-x86 architectures start using this. (and if it turns out that non-x86 architectures don't start using this, then having it in an architecture-specific header is still the right thing to do, of course) Signed-off-by: Linus Torvalds --- fs/namei.c | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 1898198abc3..0062dd17eb5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1407,18 +1407,9 @@ static inline int can_lookup(struct inode *inode) */ #ifdef CONFIG_DCACHE_WORD_ACCESS -#ifdef CONFIG_64BIT +#include -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. - */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} +#ifdef CONFIG_64BIT static inline unsigned int fold_hash(unsigned long hash) { @@ -1428,15 +1419,6 @@ static inline unsigned int fold_hash(unsigned long hash) #else /* 32-bit case */ -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - #define fold_hash(x) (x) #endif @@ -1464,17 +1446,6 @@ done: } EXPORT_SYMBOL(full_name_hash); -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -#define ONEBYTES REPEAT_BYTE(0x01) -#define SLASHBYTES REPEAT_BYTE('/') -#define HIGHBITS REPEAT_BYTE(0x80) - -/* Return the high bit set in the first byte that is a zero */ -static inline unsigned long has_zero(unsigned long a) -{ - return ((a - ONEBYTES) & ~a) & HIGHBITS; -} - /* * Calculate the length and hash of the path component, and * return the length of the component; @@ -1490,7 +1461,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) len += sizeof(unsigned long); a = *(unsigned long *)(name+len); /* Do we have any NUL or '/' bytes in this word? */ - mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); } while (!mask); /* The mask *below* the first high bit set */ -- cgit v1.2.3 From 1a48e2ac034d47ed843081c4523b63c46b46888b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2011 16:24:06 -0800 Subject: userns: Replace the hard to write inode_userns with inode_capable. This represents a change in strategy of how to handle user namespaces. Instead of tagging everything explicitly with a user namespace and bulking up all of the comparisons of uids and gids in the kernel, all uids and gids in use will have a mapping to a flat kuid and kgid spaces respectively. This allows much more of the existing logic to be preserved and in general allows for faster code. In this new and improved world we allow someone to utiliize capabilities over an inode if the inodes owner mapps into the capabilities holders user namespace and the user has capabilities in their user namespace. Which is simple and efficient. Moving the fs uid comparisons to be comparisons in a flat kuid space follows in later patches, something that is only significant if you are using user namespaces. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namei.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 701954d68ac..941c4362e29 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -228,9 +228,6 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - if (current_user_ns() != inode_userns(inode)) - goto other_perms; - if (likely(current_fsuid() == inode->i_uid)) mode >>= 6; else { @@ -244,7 +241,6 @@ static int acl_permission_check(struct inode *inode, int mask) mode >>= 3; } -other_perms: /* * If the DACs are ok we don't need any capability check. */ @@ -280,10 +276,10 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -293,7 +289,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -301,7 +297,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -1964,15 +1960,11 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) if (!(dir->i_mode & S_ISVTX)) return 0; - if (current_user_ns() != inode_userns(inode)) - goto other_userns; if (inode->i_uid == fsuid) return 0; if (dir->i_uid == fsuid) return 0; - -other_userns: - return !ns_capable(inode_userns(inode), CAP_FOWNER); + return !inode_capable(inode, CAP_FOWNER); } /* -- cgit v1.2.3 From 3f9f0aa687d45cbc8e7bb3cfd3aab555dcc8872e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Apr 2012 14:38:32 -0700 Subject: VFS: clean up and simplify getname_flags() This removes a number of silly games around strncpy_from_user() in do_getname(), and removes that helper function entirely. We instead make getname_flags() just use strncpy_from_user() properly directly. Removing the wrapper function simplifies things noticeably, mostly because we no longer play the unnecessary games with segments (x86 strncpy_from_user() no longer needs the hack), but also because the empty path handling is just much more obvious. The return value of "strncpy_to_user()" is much more obvious than checking an odd error return case from do_getname(). [ non-x86 architectures were notified of this change several weeks ago, since it is possible that they have copied the old broken x86 strncpy_from_user. But nobody reacted, so .. See http://www.spinics.net/lists/linux-arch/msg17313.html for details ] Cc: linux-arch@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/namei.c | 58 ++++++++++++++++++++++++---------------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 0062dd17eb5..494b1caa25f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -116,47 +116,37 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -static int do_getname(const char __user *filename, char *page) -{ - int retval; - unsigned long len = PATH_MAX; - - if (!segment_eq(get_fs(), KERNEL_DS)) { - if ((unsigned long) filename >= TASK_SIZE) - return -EFAULT; - if (TASK_SIZE - (unsigned long) filename < PATH_MAX) - len = TASK_SIZE - (unsigned long) filename; - } - - retval = strncpy_from_user(page, filename, len); - if (retval > 0) { - if (retval < len) - return 0; - return -ENAMETOOLONG; - } else if (!retval) - retval = -ENOENT; - return retval; -} - static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *result = __getname(); - int retval; + char *result = __getname(), *err; + int len; - if (!result) + if (unlikely(!result)) return ERR_PTR(-ENOMEM); - retval = do_getname(filename, result); - if (retval < 0) { - if (retval == -ENOENT && empty) + len = strncpy_from_user(result, filename, PATH_MAX); + err = ERR_PTR(len); + if (unlikely(len < 0)) + goto error; + + /* The empty path is special. */ + if (unlikely(!len)) { + if (empty) *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(result); - return ERR_PTR(retval); - } + err = ERR_PTR(-ENOENT); + if (!(flags & LOOKUP_EMPTY)) + goto error; } - audit_getname(result); - return result; + + err = ERR_PTR(-ENAMETOOLONG); + if (likely(len < PATH_MAX)) { + audit_getname(result); + return result; + } + +error: + __putname(result); + return err; } char *getname(const char __user * filename) -- cgit v1.2.3 From 8e96e3b7b8407be794ab1fd8e4b332818a358e78 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 3 Mar 2012 21:17:15 -0800 Subject: userns: Use uid_eq gid_eq helpers when comparing kuids and kgids in the vfs Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 941c4362e29..86512b4d38f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -228,7 +228,7 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - if (likely(current_fsuid() == inode->i_uid)) + if (likely(uid_eq(current_fsuid(), inode->i_uid))) mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { @@ -1956,13 +1956,13 @@ static int user_path_parent(int dfd, const char __user *path, */ static inline int check_sticky(struct inode *dir, struct inode *inode) { - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); if (!(dir->i_mode & S_ISVTX)) return 0; - if (inode->i_uid == fsuid) + if (uid_eq(inode->i_uid, fsuid)) return 0; - if (dir->i_uid == fsuid) + if (uid_eq(dir->i_uid, fsuid)) return 0; return !inode_capable(inode, CAP_FOWNER); } -- cgit v1.2.3 From e419b4cc585680940bc42f8ca8a071d6023fb1bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 May 2012 10:16:43 -0700 Subject: vfs: make word-at-a-time accesses handle a non-existing page It turns out that there are more cases than CONFIG_DEBUG_PAGEALLOC that can have holes in the kernel address space: it seems to happen easily with Xen, and it looks like the AMD gart64 code will also punch holes dynamically. Actually hitting that case is still very unlikely, so just do the access, and take an exception and fix it up for the very unlikely case of it being a page-crosser with no next page. And hey, this abstraction might even help other architectures that have other issues with unaligned word accesses than the possible missing next page. IOW, this could do the byte order magic too. Peter Anvin fixed a thinko in the shifting for the exception case. Reported-and-tested-by: Jana Saout Cc: Peter Anvin Signed-off-by: Linus Torvalds --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 0062dd17eb5..c42791914f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1429,7 +1429,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) unsigned long hash = 0; for (;;) { - a = *(unsigned long *)name; + a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; hash += a; @@ -1459,7 +1459,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) do { hash = (hash + a) * 9; len += sizeof(unsigned long); - a = *(unsigned long *)(name+len); + a = load_unaligned_zeropad(name+len); /* Do we have any NUL or '/' bytes in this word? */ mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); } while (!mask); -- cgit v1.2.3 From 12f8ad4b0533d9212cb1d5e58ed73d2170114785 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 4 May 2012 14:59:14 -0700 Subject: vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces The calling conventions for __d_lookup_rcu() and dentry_cmp() are annoying in different ways, and there is actually one single underlying reason for both of the annoyances. The fundamental reason is that we do the returned dentry sequence number check inside __d_lookup_rcu() instead of doing it in the caller. This results in two annoyances: - __d_lookup_rcu() now not only needs to return the dentry and the sequence number that goes along with the lookup, it also needs to return the inode pointer that was validated by that sequence number check. - and because we did the sequence number check early (to validate the name pointer and length) we also couldn't just pass the dentry itself to dentry_cmp(), we had to pass the counted string that contained the name. So that sequence number decision caused two separate ugly calling conventions. Both of these problems would be solved if we just did the sequence number check in the caller instead. There's only one caller, and that caller already has to do the sequence number check for the parent anyway, so just do that. That allows us to stop returning the dentry->d_inode in that in-out argument (pointer-to-pointer-to-inode), so we can make the inode argument just a regular input inode pointer. The caller can just load the inode from dentry->d_inode, and then do the sequence number check after that to make sure that it's synchronized with the name we looked up. And it allows us to just pass in the dentry to dentry_cmp(), which is what all the callers really wanted. Sure, dentry_cmp() has to be a bit careful about the dentry (which is not stable during RCU lookup), but that's actually very simple. And now that dentry_cmp() can clearly see that the first string argument is a dentry, we can use the direct word access for that, instead of the careful unaligned zero-padding. The dentry name is always properly aligned, since it is a single path component that is either embedded into the dentry itself, or was allocated with kmalloc() (see __d_alloc). Finally, this also uninlines the nasty slow-case for dentry comparisons: that one *does* need to do a sequence number check, since it will call in to the low-level filesystems, and we want to give those a stable inode pointer and path component length/start arguments. Doing an extra sequence check for that slow case is not a problem, though. Signed-off-by: Linus Torvalds --- fs/namei.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index c42791914f8..46bd0045575 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1154,12 +1154,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; - dentry = __d_lookup_rcu(parent, name, &seq, inode); + dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); if (!dentry) goto unlazy; - /* Memory barrier in read_seqcount_begin of child is enough */ + /* + * This sequence count validates that the inode matches + * the dentry name information from lookup. + */ + *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, seq)) + return -ECHILD; + + /* + * This sequence count validates that the parent had no + * changes while we did the lookup of the dentry above. + * + * The memory barrier in read_seqcount_begin of child is + * enough, we can use __read_seqcount_retry here. + */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; nd->seq = seq; -- cgit v1.2.3 From 446969084d33a4064a39d280806da642c54ba4ac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 23 May 2012 20:12:50 -0700 Subject: kernel: Move REPEAT_BYTE definition into linux/kernel.h And make sure that everything using it explicitly includes that header file. Signed-off-by: David S. Miller --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index f9e883c1b85..8d2ba420e42 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 36126f8f2ed8168eb13aa0662b9b9585cba100a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 10:43:17 -0700 Subject: word-at-a-time: make the interfaces truly generic This changes the interfaces in to be a bit more complicated, but a lot more generic. In particular, it allows us to really do the operations efficiently on both little-endian and big-endian machines, pretty much regardless of machine details. For example, if you can rely on a fast population count instruction on your architecture, this will allow you to make your optimized file with that. NOTE! The "generic" version in include/asm-generic/word-at-a-time.h is not truly generic, it actually only works on big-endian. Why? Because on little-endian the generic algorithms are wasteful, since you can inevitably do better. The x86 implementation is an example of that. (The only truly non-generic part of the asm-generic implementation is the "find_zero()" function, and you could make a little-endian version of it. And if the Kbuild infrastructure allowed us to pick a particular header file, that would be lovely) The functions are as follows: - WORD_AT_A_TIME_CONSTANTS: specific constants that the algorithm uses. - has_zero(): take a word, and determine if it has a zero byte in it. It gets the word, the pointer to the constant pool, and a pointer to an intermediate "data" field it can set. This is the "quick-and-dirty" zero tester: it's what is run inside the hot loops. - "prep_zero_mask()": take the word, the data that has_zero() produced, and the constant pool, and generate an *exact* mask of which byte had the first zero. This is run directly *outside* the loop, and allows the "has_zero()" function to answer the "is there a zero byte" question without necessarily getting exactly *which* byte is the first one to contain a zero. If you do multiple byte lookups concurrently (eg "hash_name()", which looks for both NUL and '/' bytes), after you've done the prep_zero_mask() phase, the result of those can be or'ed together to get the "either or" case. - The result from "prep_zero_mask()" can then be fed into "find_zero()" (to find the byte offset of the first byte that was zero) or into "zero_bytemask()" (to find the bytemask of the bytes preceding the zero byte). The existence of zero_bytemask() is optional, and is not necessary for the normal string routines. But dentry name hashing needs it, so if you enable DENTRY_WORD_AT_A_TIME you need to expose it. This changes the generic strncpy_from_user() function and the dentry hashing functions to use these modified word-at-a-time interfaces. This gets us back to the optimized state of the x86 strncpy that we lost in the previous commit when moving over to the generic version. Signed-off-by: Linus Torvalds --- fs/namei.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 93ff12b1a1d..c651f02c9fe 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1452,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash); */ static inline unsigned long hash_name(const char *name, unsigned int *hashp) { - unsigned long a, mask, hash, len; + unsigned long a, b, adata, bdata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; hash = a = 0; len = -sizeof(unsigned long); @@ -1460,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) hash = (hash + a) * 9; len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); - /* Do we have any NUL or '/' bytes in this word? */ - mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); - } while (!mask); - - /* The mask *below* the first high bit set */ - mask = (mask - 1) & ~mask; - mask >>= 7; - hash += a & mask; + b = a ^ REPEAT_BYTE('/'); + } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); + + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + + mask = create_zero_mask(adata | bdata); + + hash += a & zero_bytemask(mask); *hashp = fold_hash(hash); - return len + count_masked_bytes(mask); + return len + find_zero(mask); } #else -- cgit v1.2.3 From 962830df366b66e71849040770ae6ba55a8b4aec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 8 May 2012 13:32:02 +0930 Subject: brlocks/lglocks: API cleanups lglocks and brlocks are currently generated with some complicated macros in lglock.h. But there's no reason to not just use common utility functions and put all the data into a common data structure. In preparation, this patch changes the API to look more like normal function calls with pointers, not magic macros. The patch is rather large because I move over all users in one go to keep it bisectable. This impacts the VFS somewhat in terms of lines changed. But no actual behaviour change. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Al Viro Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell Signed-off-by: Al Viro --- fs/namei.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index c651f02c9fe..93ac9323b1f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) mntget(nd->path.mnt); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); nd->flags &= ~LOOKUP_RCU; return 0; @@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd) if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { spin_unlock(&dentry->d_lock); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return -ECHILD; } BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); mntget(nd->path.mnt); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -681,15 +681,15 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); parent = mnt->mnt_parent; if (&parent->mnt == path->mnt) { - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); @@ -947,7 +947,7 @@ failed: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return -ECHILD; } @@ -1265,7 +1265,7 @@ static void terminate_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); } } @@ -1620,7 +1620,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } else { @@ -1633,7 +1633,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); set_root_rcu(nd); } else { @@ -1646,7 +1646,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); do { @@ -1682,7 +1682,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (fput_needed) *fp = file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); } else { path_get(&file->f_path); -- cgit v1.2.3 From 697f514df10b0f46bcd7596c1be18b7e2e9b28bb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:05 +0200 Subject: vfs: split do_lookup() Split do_lookup() into two functions: lookup_fast() - does cached lookup without i_mutex lookup_slow() - does lookup with i_mutex Both follow managed dentries. The new functions are needed by atomic_open. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 59 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 14 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 93ac9323b1f..7f4ab820811 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name, * small and for now I'd prefer to have fast path as straight as possible. * It _is_ time-critical. */ -static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path, struct inode **inode) +static int lookup_fast(struct nameidata *nd, struct qstr *name, + struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; @@ -1208,7 +1208,7 @@ unlazy: goto need_lookup; } } -done: + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1222,6 +1222,17 @@ done: return 0; need_lookup: + return 1; +} + +/* Fast lookup failed, do it the slow way */ +static int lookup_slow(struct nameidata *nd, struct qstr *name, + struct path *path) +{ + struct dentry *dentry, *parent; + int err; + + parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); mutex_lock(&parent->d_inode->i_mutex); @@ -1229,7 +1240,16 @@ need_lookup: mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); - goto done; + path->mnt = nd->path.mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + return 0; } static inline int may_lookup(struct nameidata *nd) @@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path, */ if (unlikely(type != LAST_NORM)) return handle_dots(nd, type); - err = do_lookup(nd, name, path, &inode); + err = lookup_fast(nd, name, path, &inode); if (unlikely(err)) { - terminate_walk(nd); - return err; - } - if (!inode) { - path_to_nameidata(path, nd); - terminate_walk(nd); - return -ENOENT; + if (err < 0) + goto out_err; + + err = lookup_slow(nd, name, path); + if (err < 0) + goto out_err; + + inode = path->dentry->d_inode; } + err = -ENOENT; + if (!inode) + goto out_path_put; + if (should_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { - terminate_walk(nd); - return -ECHILD; + err = -ECHILD; + goto out_err; } } BUG_ON(inode != path->dentry->d_inode); @@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = inode; return 0; + +out_path_put: + path_to_nameidata(path, nd); +out_err: + terminate_walk(nd); + return err; } /* -- cgit v1.2.3 From e276ae672fa2d727721b1a5a2508ff34bac85439 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:06 +0200 Subject: vfs: do_last(): make exit RCU safe Allow returning from do_last() with LOOKUP_RCU still set on the "out:" and "exit:" labels. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 7f4ab820811..edc18cd63a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2383,7 +2383,7 @@ common: out: if (want_write) mnt_drop_write(nd->path.mnt); - path_put(&nd->path); + terminate_walk(nd); return filp; exit_mutex_unlock: -- cgit v1.2.3 From a1eb33153090549e622ab42cb375af06614dd7a8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:07 +0200 Subject: vfs: do_last(): inline walk_component() Copy walk_component() into do_lookup(). Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index edc18cd63a8..f6b31c94c11 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2200,6 +2200,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, int want_write = 0; int acc_mode = op->acc_mode; struct file *filp; + struct inode *inode; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2237,12 +2238,36 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = 1; /* we _can_ be in RCU mode here */ - error = walk_component(nd, path, &nd->last, LAST_NORM, - !symlink_ok); - if (error < 0) - return ERR_PTR(error); - if (error) /* symlink */ + error = lookup_fast(nd, &nd->last, path, &inode); + if (unlikely(error)) { + if (error < 0) + goto exit; + + error = lookup_slow(nd, &nd->last, path); + if (error < 0) + goto exit; + + inode = path->dentry->d_inode; + } + error = -ENOENT; + if (!inode) { + path_to_nameidata(path, nd); + goto exit; + } + + if (should_follow_link(inode, !symlink_ok)) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + error = -ECHILD; + goto exit; + } + } + BUG_ON(inode != path->dentry->d_inode); return NULL; + } + path_to_nameidata(path, nd); + nd->inode = inode; + /* sayonara */ error = complete_walk(nd); if (error) -- cgit v1.2.3 From decf3400879d02d0eafedea52c7f208587be062a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:08 +0200 Subject: vfs: do_last(): use inode variable Use helper variable instead of path->dentry->d_inode before complete_walk(). This will allow this code to be used in RCU mode. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index f6b31c94c11..41445e7fd33 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2358,15 +2358,17 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) nd->flags |= LOOKUP_JUMPED; + BUG_ON(nd->flags & LOOKUP_RCU); + inode = path->dentry->d_inode; error = -ENOENT; - if (!path->dentry->d_inode) + if (!inode) goto exit_dput; - if (path->dentry->d_inode->i_op->follow_link) + if (inode->i_op->follow_link) return NULL; path_to_nameidata(path, nd); - nd->inode = path->dentry->d_inode; + nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) -- cgit v1.2.3 From d45ea86792db9679ed010b2c3df3db32b2ce5bde Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:09 +0200 Subject: vfs: make follow_link check RCU safe This will allow this code to be used in RCU mode. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 41445e7fd33..c6b996817bb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2201,6 +2201,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, int acc_mode = op->acc_mode; struct file *filp; struct inode *inode; + int symlink_ok = 0; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2232,7 +2233,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } if (!(open_flag & O_CREAT)) { - int symlink_ok = 0; if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) @@ -2364,8 +2364,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (!inode) goto exit_dput; - if (inode->i_op->follow_link) + if (should_follow_link(inode, !symlink_ok)) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + error = -ECHILD; + goto exit; + } + } + BUG_ON(inode != path->dentry->d_inode); return NULL; + } path_to_nameidata(path, nd); nd->inode = inode; -- cgit v1.2.3 From 54c33e7f95284539e52ec2d99dcdf6efd29b247f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:10 +0200 Subject: vfs: do_last(): make ENOENT exit RCU safe This will allow this code to be used in RCU mode. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index c6b996817bb..a7e994bb78c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2361,8 +2361,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; error = -ENOENT; - if (!inode) - goto exit_dput; + if (!inode) { + path_to_nameidata(path, nd); + goto exit; + } if (should_follow_link(inode, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { -- cgit v1.2.3 From af2f55426d1d888dcc0ba8dc9e9deb49fae38e38 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:11 +0200 Subject: vfs: do_last(): check LOOKUP_DIRECTORY Check for ENOTDIR before finishing open. This allows this code to be shared between O_CREAT and plain opens. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index a7e994bb78c..4767c0588b6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2386,6 +2386,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; + error = -ENOTDIR; + if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + goto exit; ok: if (!S_ISREG(nd->inode->i_mode)) will_truncate = 0; -- cgit v1.2.3 From 050ac841ea90610067fec26150574be8c6077738 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:12 +0200 Subject: vfs: do_last(): only return EISDIR for O_CREAT This allows this code to be shared between O_CREAT and plain opens. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 4767c0588b6..90210b46b46 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2384,7 +2384,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) return ERR_PTR(error); error = -EISDIR; - if (S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto exit; error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) -- cgit v1.2.3 From d7fdd7f6e1afbffda03aeacb90039c092e8cacf8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:13 +0200 Subject: vfs: do_last(): add audit_inode before open This allows this code to be shared between O_CREAT and plain opens. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 90210b46b46..125386c250b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2389,6 +2389,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) goto exit; + audit_inode(pathname, nd->path.dentry); ok: if (!S_ISREG(nd->inode->i_mode)) will_truncate = 0; -- cgit v1.2.3 From 5f5daac12a4cef568d1269be0215fec0667193c1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:14 +0200 Subject: vfs: do_last() common post lookup Now the post lookup code can be shared between O_CREAT and plain opens since they are essentially the same. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 125386c250b..998d5316921 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2249,37 +2249,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } - error = -ENOENT; - if (!inode) { - path_to_nameidata(path, nd); - goto exit; - } - - if (should_follow_link(inode, !symlink_ok)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - error = -ECHILD; - goto exit; - } - } - BUG_ON(inode != path->dentry->d_inode); - return NULL; - } - path_to_nameidata(path, nd); - nd->inode = inode; - - /* sayonara */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); - - error = -ENOTDIR; - if (nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) - goto exit; - } - audit_inode(pathname, nd->path.dentry); - goto ok; + goto finish_lookup; } /* create side of things */ @@ -2360,6 +2330,8 @@ static struct file *do_last(struct nameidata *nd, struct path *path, BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; +finish_lookup: + /* we _can_ be in RCU mode here */ error = -ENOENT; if (!inode) { path_to_nameidata(path, nd); -- cgit v1.2.3 From 16b1c1cd71176ab0a76b26818fbf12db9183ed57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 May 2012 17:30:19 +0200 Subject: vfs: retry last component if opening stale dentry NFS optimizes away d_revalidates for last component of open. This means that open itself can find the dentry stale. This patch allows the filesystem to return EOPENSTALE and the VFS will retry the lookup on just the last component if possible. If the lookup was done using RCU mode, including the last component, then this is not possible since the parent dentry is lost. In this case fall back to non-RCU lookup. Currently this is not used since NFS will always leave RCU mode. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 998d5316921..7d694194024 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2202,6 +2202,8 @@ static struct file *do_last(struct nameidata *nd, struct path *path, struct file *filp; struct inode *inode; int symlink_ok = 0; + struct path save_parent = { .dentry = NULL, .mnt = NULL }; + bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2267,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (nd->last.name[nd->last.len]) goto exit; +retry_lookup: mutex_lock(&dir->d_inode->i_mutex); dentry = lookup_hash(nd); @@ -2349,12 +2352,21 @@ finish_lookup: return NULL; } - path_to_nameidata(path, nd); + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { + path_to_nameidata(path, nd); + } else { + save_parent.dentry = nd->path.dentry; + save_parent.mnt = mntget(path->mnt); + nd->path.dentry = path->dentry; + + } nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); - if (error) + if (error) { + path_put(&save_parent); return ERR_PTR(error); + } error = -EISDIR; if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto exit; @@ -2377,6 +2389,20 @@ common: if (error) goto exit; filp = nameidata_to_filp(nd); + if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) { + BUG_ON(save_parent.dentry != dir); + path_put(&nd->path); + nd->path = save_parent; + nd->inode = dir->d_inode; + save_parent.mnt = NULL; + save_parent.dentry = NULL; + if (want_write) { + mnt_drop_write(nd->path.mnt); + want_write = 0; + } + retried = true; + goto retry_lookup; + } if (!IS_ERR(filp)) { error = ima_file_check(filp, op->acc_mode); if (error) { @@ -2396,6 +2422,7 @@ common: out: if (want_write) mnt_drop_write(nd->path.mnt); + path_put(&save_parent); terminate_walk(nd); return filp; @@ -2459,6 +2486,12 @@ out: if (base) fput(base); release_open_intent(nd); + if (filp == ERR_PTR(-EOPENSTALE)) { + if (flags & LOOKUP_RCU) + filp = ERR_PTR(-ECHILD); + else + filp = ERR_PTR(-ESTALE); + } return filp; out_filp: -- cgit v1.2.3