aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c8
-rw-r--r--fs/9p/v9fs.c35
-rw-r--r--fs/9p/v9fs.h2
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c178
-rw-r--r--fs/9p/vfs_inode_dotl.c59
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h4
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/inode.c19
-rw-r--r--fs/adfs/super.c27
-rw-r--r--fs/affs/affs.h15
-rw-r--r--fs/affs/amigaffs.c22
-rw-r--r--fs/affs/bitmap.c32
-rw-r--r--fs/affs/file.c18
-rw-r--r--fs/affs/inode.c25
-rw-r--r--fs/affs/namei.c4
-rw-r--r--fs/affs/super.c91
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/dir.c14
-rw-r--r--fs/afs/mntpt.c4
-rw-r--r--fs/afs/server.c10
-rw-r--r--fs/afs/super.c8
-rw-r--r--fs/afs/vlocation.c14
-rw-r--r--fs/aio.c73
-rw-r--r--fs/attr.c16
-rw-r--r--fs/autofs4/autofs_i.h8
-rw-r--r--fs/autofs4/dev-ioctl.c26
-rw-r--r--fs/autofs4/expire.c45
-rw-r--r--fs/autofs4/inode.c24
-rw-r--r--fs/autofs4/root.c93
-rw-r--r--fs/autofs4/waitq.c8
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/befs.h4
-rw-r--r--fs/befs/linuxvfs.c36
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/bfs/file.c15
-rw-r--r--fs/bfs/inode.c13
-rw-r--r--fs/binfmt_aout.c59
-rw-r--r--fs/binfmt_elf.c182
-rw-r--r--fs/binfmt_elf_fdpic.c15
-rw-r--r--fs/binfmt_em86.c5
-rw-r--r--fs/binfmt_flat.c7
-rw-r--r--fs/binfmt_misc.c15
-rw-r--r--fs/binfmt_script.c12
-rw-r--r--fs/binfmt_som.c5
-rw-r--r--fs/bio-integrity.c44
-rw-r--r--fs/bio.c248
-rw-r--r--fs/block_dev.c116
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c10
-rw-r--r--fs/btrfs/async-thread.c9
-rw-r--r--fs/btrfs/backref.c355
-rw-r--r--fs/btrfs/backref.h21
-rw-r--r--fs/btrfs/btrfs_inode.h33
-rw-r--r--fs/btrfs/check-integrity.c54
-rw-r--r--fs/btrfs/compression.c20
-rw-r--r--fs/btrfs/ctree.c1197
-rw-r--r--fs/btrfs/ctree.h678
-rw-r--r--fs/btrfs/delayed-inode.c60
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c203
-rw-r--r--fs/btrfs/delayed-ref.h68
-rw-r--r--fs/btrfs/dev-replace.c856
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c59
-rw-r--r--fs/btrfs/disk-io.c570
-rw-r--r--fs/btrfs/disk-io.h14
-rw-r--r--fs/btrfs/extent-tree.c994
-rw-r--r--fs/btrfs/extent_io.c251
-rw-r--r--fs/btrfs/extent_io.h27
-rw-r--r--fs/btrfs/extent_map.c64
-rw-r--r--fs/btrfs/extent_map.h11
-rw-r--r--fs/btrfs/file-item.c38
-rw-r--r--fs/btrfs/file.c850
-rw-r--r--fs/btrfs/free-space-cache.c83
-rw-r--r--fs/btrfs/hash.h10
-rw-r--r--fs/btrfs/inode-item.c285
-rw-r--r--fs/btrfs/inode-map.c5
-rw-r--r--fs/btrfs/inode.c1378
-rw-r--r--fs/btrfs/ioctl.c976
-rw-r--r--fs/btrfs/ioctl.h143
-rw-r--r--fs/btrfs/locking.c16
-rw-r--r--fs/btrfs/math.h44
-rw-r--r--fs/btrfs/ordered-data.c200
-rw-r--r--fs/btrfs/ordered-data.h21
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/qgroup.c1608
-rw-r--r--fs/btrfs/reada.c49
-rw-r--r--fs/btrfs/relocation.c56
-rw-r--r--fs/btrfs/root-tree.c104
-rw-r--r--fs/btrfs/scrub.c1883
-rw-r--r--fs/btrfs/send.c4695
-rw-r--r--fs/btrfs/send.h134
-rw-r--r--fs/btrfs/struct-funcs.c196
-rw-r--r--fs/btrfs/super.c200
-rw-r--r--fs/btrfs/transaction.c526
-rw-r--r--fs/btrfs/transaction.h30
-rw-r--r--fs/btrfs/tree-log.c1004
-rw-r--r--fs/btrfs/ulist.c7
-rw-r--r--fs/btrfs/ulist.h9
-rw-r--r--fs/btrfs/volumes.c1122
-rw-r--r--fs/btrfs/volumes.h41
-rw-r--r--fs/btrfs/xattr.c13
-rw-r--r--fs/btrfs/zlib.c8
-rw-r--r--fs/buffer.c255
-rw-r--r--fs/cachefiles/interface.c57
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/namei.c5
-rw-r--r--fs/cachefiles/rdwr.c122
-rw-r--r--fs/cachefiles/xattr.c2
-rw-r--r--fs/ceph/addr.c84
-rw-r--r--fs/ceph/caps.c20
-rw-r--r--fs/ceph/debugfs.c1
-rw-r--r--fs/ceph/dir.c50
-rw-r--r--fs/ceph/export.c24
-rw-r--r--fs/ceph/file.c157
-rw-r--r--fs/ceph/inode.c34
-rw-r--r--fs/ceph/ioctl.c11
-rw-r--r--fs/ceph/mds_client.c37
-rw-r--r--fs/ceph/snap.c18
-rw-r--r--fs/ceph/super.c49
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c1
-rw-r--r--fs/char_dev.c18
-rw-r--r--fs/cifs/Kconfig48
-rw-r--r--fs/cifs/Makefile3
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/cache.c2
-rw-r--r--fs/cifs/cifs_debug.c62
-rw-r--r--fs/cifs/cifs_debug.h72
-rw-r--r--fs/cifs/cifs_dfs_ref.c9
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifs_unicode.c84
-rw-r--r--fs/cifs/cifs_unicode.h6
-rw-r--r--fs/cifs/cifsacl.c814
-rw-r--r--fs/cifs/cifsacl.h66
-rw-r--r--fs/cifs/cifsencrypt.c117
-rw-r--r--fs/cifs/cifsfs.c120
-rw-r--r--fs/cifs/cifsfs.h9
-rw-r--r--fs/cifs/cifsglob.h474
-rw-r--r--fs/cifs/cifspdu.h2
-rw-r--r--fs/cifs/cifsproto.h392
-rw-r--r--fs/cifs/cifssmb.c662
-rw-r--r--fs/cifs/connect.c960
-rw-r--r--fs/cifs/dir.c573
-rw-r--r--fs/cifs/file.c1338
-rw-r--r--fs/cifs/inode.c919
-rw-r--r--fs/cifs/ioctl.c38
-rw-r--r--fs/cifs/link.c120
-rw-r--r--fs/cifs/misc.c63
-rw-r--r--fs/cifs/netmisc.c23
-rw-r--r--fs/cifs/nterr.c6
-rw-r--r--fs/cifs/nterr.h22
-rw-r--r--fs/cifs/ntlmssp.h10
-rw-r--r--fs/cifs/readdir.c264
-rw-r--r--fs/cifs/sess.c15
-rw-r--r--fs/cifs/smb1ops.c726
-rw-r--r--fs/cifs/smb2file.c290
-rw-r--r--fs/cifs/smb2glob.h58
-rw-r--r--fs/cifs/smb2inode.c257
-rw-r--r--fs/cifs/smb2maperror.c2479
-rw-r--r--fs/cifs/smb2misc.c583
-rw-r--r--fs/cifs/smb2ops.c720
-rw-r--r--fs/cifs/smb2pdu.c2211
-rw-r--r--fs/cifs/smb2pdu.h857
-rw-r--r--fs/cifs/smb2proto.h163
-rw-r--r--fs/cifs/smb2status.h1782
-rw-r--r--fs/cifs/smb2transport.c331
-rw-r--r--fs/cifs/smbencrypt.c14
-rw-r--r--fs/cifs/transport.c383
-rw-r--r--fs/cifs/xattr.c24
-rw-r--r--fs/coda/cache.c10
-rw-r--r--fs/coda/dir.c14
-rw-r--r--fs/coda/inode.c37
-rw-r--r--fs/compat.c122
-rw-r--r--fs/compat_binfmt_elf.c7
-rw-r--r--fs/compat_ioctl.c40
-rw-r--r--fs/configfs/dir.c6
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/coredump.c693
-rw-r--r--fs/coredump.h6
-rw-r--r--fs/cramfs/inode.c6
-rw-r--r--fs/dcache.c93
-rw-r--r--fs/debugfs/file.c76
-rw-r--r--fs/debugfs/inode.c124
-rw-r--r--fs/devpts/inode.c67
-rw-r--r--fs/direct-io.c15
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/ast.c4
-rw-r--r--fs/dlm/config.c86
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/debug_fs.c103
-rw-r--r--fs/dlm/dir.c287
-rw-r--r--fs/dlm/dir.h7
-rw-r--r--fs/dlm/dlm_internal.h109
-rw-r--r--fs/dlm/lock.c1308
-rw-r--r--fs/dlm/lock.h5
-rw-r--r--fs/dlm/lockspace.c60
-rw-r--r--fs/dlm/lowcomms.c220
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c17
-rw-r--r--fs/dlm/netlink.c8
-rw-r--r--fs/dlm/rcom.c149
-rw-r--r--fs/dlm/rcom.h1
-rw-r--r--fs/dlm/recover.c332
-rw-r--r--fs/dlm/recover.h2
-rw-r--r--fs/dlm/recoverd.c41
-rw-r--r--fs/dlm/recoverd.h1
-rw-r--r--fs/dlm/user.c7
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/dentry.c20
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h38
-rw-r--r--fs/ecryptfs/file.c100
-rw-r--r--fs/ecryptfs/inode.c109
-rw-r--r--fs/ecryptfs/kthread.c79
-rw-r--r--fs/ecryptfs/main.c42
-rw-r--r--fs/ecryptfs/messaging.c135
-rw-r--r--fs/ecryptfs/miscdev.c98
-rw-r--r--fs/ecryptfs/mmap.c51
-rw-r--r--fs/efs/efs.h2
-rw-r--r--fs/efs/inode.c4
-rw-r--r--fs/efs/namei.c3
-rw-r--r--fs/efs/super.c5
-rw-r--r--fs/eventfd.c20
-rw-r--r--fs/eventpoll.c75
-rw-r--r--fs/exec.c799
-rw-r--r--fs/exofs/inode.c51
-rw-r--r--fs/exofs/namei.c4
-rw-r--r--fs/exofs/ore.c19
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/exofs/super.c20
-rw-r--r--fs/exofs/sys.c7
-rw-r--r--fs/exportfs/expfs.c39
-rw-r--r--fs/ext2/acl.c32
-rw-r--r--fs/ext2/balloc.c16
-rw-r--r--fs/ext2/ialloc.c1
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext2/super.c54
-rw-r--r--fs/ext3/acl.c32
-rw-r--r--fs/ext3/balloc.c9
-rw-r--r--fs/ext3/bitmap.c12
-rw-r--r--fs/ext3/dir.c9
-rw-r--r--fs/ext3/fsync.c9
-rw-r--r--fs/ext3/inode.c30
-rw-r--r--fs/ext3/namei.c48
-rw-r--r--fs/ext3/namei.h19
-rw-r--r--fs/ext3/super.c47
-rw-r--r--fs/ext4/Kconfig17
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c37
-rw-r--r--fs/ext4/balloc.c71
-rw-r--r--fs/ext4/bitmap.c24
-rw-r--r--fs/ext4/dir.c118
-rw-r--r--fs/ext4/ext4.h249
-rw-r--r--fs/ext4/ext4_extents.h40
-rw-r--r--fs/ext4/ext4_jbd2.c12
-rw-r--r--fs/ext4/ext4_jbd2.h32
-rw-r--r--fs/ext4/extents.c864
-rw-r--r--fs/ext4/extents_status.c500
-rw-r--r--fs/ext4/extents_status.h45
-rw-r--r--fs/ext4/file.c462
-rw-r--r--fs/ext4/fsync.c111
-rw-r--r--fs/ext4/ialloc.c43
-rw-r--r--fs/ext4/indirect.c23
-rw-r--r--fs/ext4/inline.c1884
-rw-r--r--fs/ext4/inode.c836
-rw-r--r--fs/ext4/ioctl.c41
-rw-r--r--fs/ext4/mballoc.c227
-rw-r--r--fs/ext4/mballoc.h5
-rw-r--r--fs/ext4/migrate.c1
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/move_extent.c521
-rw-r--r--fs/ext4/namei.c613
-rw-r--r--fs/ext4/page-io.c179
-rw-r--r--fs/ext4/resize.c459
-rw-r--r--fs/ext4/super.c553
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/xattr.c121
-rw-r--r--fs/ext4/xattr.h158
-rw-r--r--fs/f2fs/Kconfig53
-rw-r--r--fs/f2fs/Makefile7
-rw-r--r--fs/f2fs/acl.c412
-rw-r--r--fs/f2fs/acl.h57
-rw-r--r--fs/f2fs/checkpoint.c793
-rw-r--r--fs/f2fs/data.c718
-rw-r--r--fs/f2fs/debug.c353
-rw-r--r--fs/f2fs/dir.c674
-rw-r--r--fs/f2fs/f2fs.h1087
-rw-r--r--fs/f2fs/file.c646
-rw-r--r--fs/f2fs/gc.c716
-rw-r--r--fs/f2fs/gc.h117
-rw-r--r--fs/f2fs/hash.c101
-rw-r--r--fs/f2fs/inode.c272
-rw-r--r--fs/f2fs/namei.c503
-rw-r--r--fs/f2fs/node.c1760
-rw-r--r--fs/f2fs/node.h353
-rw-r--r--fs/f2fs/recovery.c377
-rw-r--r--fs/f2fs/segment.c1757
-rw-r--r--fs/f2fs/segment.h631
-rw-r--r--fs/f2fs/super.c701
-rw-r--r--fs/f2fs/xattr.c443
-rw-r--r--fs/f2fs/xattr.h145
-rw-r--r--fs/fat/Makefile2
-rw-r--r--fs/fat/cache.c10
-rw-r--r--fs/fat/dir.c320
-rw-r--r--fs/fat/fat.h114
-rw-r--r--fs/fat/fatent.c13
-rw-r--r--fs/fat/file.c21
-rw-r--r--fs/fat/inode.c278
-rw-r--r--fs/fat/misc.c13
-rw-r--r--fs/fat/namei_msdos.c48
-rw-r--r--fs/fat/namei_vfat.c62
-rw-r--r--fs/fat/nfs.c101
-rw-r--r--fs/fcntl.c193
-rw-r--r--fs/fhandle.c23
-rw-r--r--fs/file.c573
-rw-r--r--fs/file_table.c171
-rw-r--r--fs/filesystems.c4
-rw-r--r--fs/freevxfs/vxfs_inode.c4
-rw-r--r--fs/freevxfs/vxfs_lookup.c4
-rw-r--r--fs/freevxfs/vxfs_super.c5
-rw-r--r--fs/fs-writeback.c32
-rw-r--r--fs/fs_struct.c56
-rw-r--r--fs/fscache/cache.c8
-rw-r--r--fs/fscache/cookie.c78
-rw-r--r--fs/fscache/internal.h15
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fscache/object.c101
-rw-r--r--fs/fscache/operation.c140
-rw-r--r--fs/fscache/page.c195
-rw-r--r--fs/fscache/stats.c17
-rw-r--r--fs/fuse/Kconfig16
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/cuse.c40
-rw-r--r--fs/fuse/dev.c13
-rw-r--r--fs/fuse/dir.c120
-rw-r--r--fs/fuse/file.c33
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c73
-rw-r--r--fs/generic_acl.c4
-rw-r--r--fs/gfs2/acl.c14
-rw-r--r--fs/gfs2/aops.c27
-rw-r--r--fs/gfs2/bmap.c75
-rw-r--r--fs/gfs2/dentry.c6
-rw-r--r--fs/gfs2/dir.c16
-rw-r--r--fs/gfs2/export.c4
-rw-r--r--fs/gfs2/file.c136
-rw-r--r--fs/gfs2/glock.c139
-rw-r--r--fs/gfs2/glock.h54
-rw-r--r--fs/gfs2/glops.c20
-rw-r--r--fs/gfs2/incore.h70
-rw-r--r--fs/gfs2/inode.c300
-rw-r--r--fs/gfs2/lock_dlm.c28
-rw-r--r--fs/gfs2/lops.c25
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/meta_io.c7
-rw-r--r--fs/gfs2/ops_fstype.c52
-rw-r--r--fs/gfs2/quota.c130
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/rgrp.c1391
-rw-r--r--fs/gfs2/rgrp.h28
-rw-r--r--fs/gfs2/super.c44
-rw-r--r--fs/gfs2/sys.c23
-rw-r--r--fs/gfs2/trace_gfs2.h59
-rw-r--r--fs/gfs2/trans.c12
-rw-r--r--fs/gfs2/trans.h7
-rw-r--r--fs/gfs2/util.h18
-rw-r--r--fs/gfs2/xattr.c128
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/extent.c2
-rw-r--r--fs/hfs/hfs_fs.h19
-rw-r--r--fs/hfs/inode.c46
-rw-r--r--fs/hfs/mdb.c17
-rw-r--r--fs/hfs/super.c95
-rw-r--r--fs/hfs/sysdep.c4
-rw-r--r--fs/hfsplus/bitmap.c17
-rw-r--r--fs/hfsplus/btree.c5
-rw-r--r--fs/hfsplus/catalog.c4
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hfsplus/extents.c24
-rw-r--r--fs/hfsplus/hfsplus_fs.h13
-rw-r--r--fs/hfsplus/inode.c43
-rw-r--r--fs/hfsplus/options.c15
-rw-r--r--fs/hfsplus/super.c67
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c24
-rw-r--r--fs/hostfs/hostfs_user.c1
-rw-r--r--fs/hpfs/anode.c6
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/dnode.c28
-rw-r--r--fs/hpfs/file.c20
-rw-r--r--fs/hpfs/hpfs_fn.h7
-rw-r--r--fs/hpfs/inode.c24
-rw-r--r--fs/hpfs/namei.c10
-rw-r--r--fs/hpfs/super.c26
-rw-r--r--fs/hppfs/hppfs.c28
-rw-r--r--fs/hugetlbfs/inode.c149
-rw-r--r--fs/inode.c36
-rw-r--r--fs/internal.h19
-rw-r--r--fs/ioctl.c25
-rw-r--r--fs/isofs/export.c3
-rw-r--r--fs/isofs/inode.c22
-rw-r--r--fs/isofs/isofs.h6
-rw-r--r--fs/isofs/namei.c2
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/commit.c45
-rw-r--r--fs/jbd/journal.c12
-rw-r--r--fs/jbd/recovery.c7
-rw-r--r--fs/jbd/transaction.c68
-rw-r--r--fs/jbd2/commit.c44
-rw-r--r--fs/jbd2/journal.c13
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c94
-rw-r--r--fs/jffs2/acl.c30
-rw-r--r--fs/jffs2/dir.c16
-rw-r--r--fs/jffs2/file.c47
-rw-r--r--fs/jffs2/fs.c24
-rw-r--r--fs/jffs2/nodemgmt.c6
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/readinode.c13
-rw-r--r--fs/jffs2/super.c10
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/acl.c4
-rw-r--r--fs/jfs/file.c10
-rw-r--r--fs/jfs/inode.c20
-rw-r--r--fs/jfs/ioctl.c43
-rw-r--r--fs/jfs/jfs_discard.c121
-rw-r--r--fs/jfs/jfs_discard.h26
-rw-r--r--fs/jfs/jfs_dmap.c126
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_filsys.h3
-rw-r--r--fs/jfs/jfs_imap.c22
-rw-r--r--fs/jfs/jfs_incore.h9
-rw-r--r--fs/jfs/jfs_txnmgr.c9
-rw-r--r--fs/jfs/namei.c18
-rw-r--r--fs/jfs/super.c104
-rw-r--r--fs/jfs/xattr.c4
-rw-r--r--fs/libfs.c14
-rw-r--r--fs/lockd/clnt4xdr.c8
-rw-r--r--fs/lockd/clntproc.c17
-rw-r--r--fs/lockd/clntxdr.c10
-rw-r--r--fs/lockd/grace.c16
-rw-r--r--fs/lockd/host.c107
-rw-r--r--fs/lockd/mon.c94
-rw-r--r--fs/lockd/netns.h11
-rw-r--r--fs/lockd/svc.c61
-rw-r--r--fs/lockd/svc4proc.c14
-rw-r--r--fs/lockd/svclock.c20
-rw-r--r--fs/lockd/svcproc.c19
-rw-r--r--fs/lockd/svcsubs.c19
-rw-r--r--fs/locks.c66
-rw-r--r--fs/logfs/dev_bdev.c15
-rw-r--r--fs/logfs/dir.c4
-rw-r--r--fs/logfs/inode.c29
-rw-r--r--fs/logfs/journal.c2
-rw-r--r--fs/logfs/readwrite.c19
-rw-r--r--fs/logfs/segment.c2
-rw-r--r--fs/logfs/super.c3
-rw-r--r--fs/minix/file.c6
-rw-r--r--fs/minix/inode.c38
-rw-r--r--fs/minix/itree_v2.c3
-rw-r--r--fs/minix/namei.c4
-rw-r--r--fs/mount.h16
-rw-r--r--fs/namei.c1427
-rw-r--r--fs/namespace.c526
-rw-r--r--fs/ncpfs/dir.c14
-rw-r--r--fs/ncpfs/inode.c15
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Kconfig25
-rw-r--r--fs/nfs/Makefile24
-rw-r--r--fs/nfs/blocklayout/blocklayout.c310
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c25
-rw-r--r--fs/nfs/blocklayout/extents.c3
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.c347
-rw-r--r--fs/nfs/callback.h9
-rw-r--r--fs/nfs/callback_proc.c48
-rw-r--r--fs/nfs/callback_xdr.c9
-rw-r--r--fs/nfs/client.c902
-rw-r--r--fs/nfs/delegation.c7
-rw-r--r--fs/nfs/delegation.h21
-rw-r--r--fs/nfs/dir.c482
-rw-r--r--fs/nfs/direct.c139
-rw-r--r--fs/nfs/dns_resolve.c9
-rw-r--r--fs/nfs/file.c278
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/fscache.h21
-rw-r--r--fs/nfs/getroot.c50
-rw-r--r--fs/nfs/idmap.c169
-rw-r--r--fs/nfs/inode.c162
-rw-r--r--fs/nfs/internal.h185
-rw-r--r--fs/nfs/mount_clnt.c9
-rw-r--r--fs/nfs/namespace.c56
-rw-r--r--fs/nfs/netns.h6
-rw-r--r--fs/nfs/nfs.h29
-rw-r--r--fs/nfs/nfs2super.c31
-rw-r--r--fs/nfs/nfs2xdr.c32
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3client.c65
-rw-r--r--fs/nfs/nfs3proc.c61
-rw-r--r--fs/nfs/nfs3super.c31
-rw-r--r--fs/nfs/nfs3xdr.c50
-rw-r--r--fs/nfs/nfs4_fs.h85
-rw-r--r--fs/nfs/nfs4client.c897
-rw-r--r--fs/nfs/nfs4file.c138
-rw-r--r--fs/nfs/nfs4filelayout.c113
-rw-r--r--fs/nfs/nfs4filelayout.h17
-rw-r--r--fs/nfs/nfs4filelayoutdev.c44
-rw-r--r--fs/nfs/nfs4getroot.c50
-rw-r--r--fs/nfs/nfs4namespace.c19
-rw-r--r--fs/nfs/nfs4proc.c1601
-rw-r--r--fs/nfs/nfs4renewd.c3
-rw-r--r--fs/nfs/nfs4session.c552
-rw-r--r--fs/nfs/nfs4session.h142
-rw-r--r--fs/nfs/nfs4state.c473
-rw-r--r--fs/nfs/nfs4super.c358
-rw-r--r--fs/nfs/nfs4sysctl.c69
-rw-r--r--fs/nfs/nfs4xdr.c212
-rw-r--r--fs/nfs/objlayout/objio_osd.c64
-rw-r--r--fs/nfs/objlayout/objlayout.c11
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c510
-rw-r--r--fs/nfs/pnfs.h51
-rw-r--r--fs/nfs/pnfs_dev.c27
-rw-r--r--fs/nfs/proc.c86
-rw-r--r--fs/nfs/read.c37
-rw-r--r--fs/nfs/super.c732
-rw-r--r--fs/nfs/sysctl.c26
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfs/write.c210
-rw-r--r--fs/nfsd/export.c10
-rw-r--r--fs/nfsd/fault_inject.c113
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/netns.h70
-rw-r--r--fs/nfsd/nfs2acl.c5
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs3xdr.c47
-rw-r--r--fs/nfsd/nfs4callback.c75
-rw-r--r--fs/nfsd/nfs4idmap.c8
-rw-r--r--fs/nfsd/nfs4proc.c92
-rw-r--r--fs/nfsd/nfs4recover.c566
-rw-r--r--fs/nfsd/nfs4state.c1430
-rw-r--r--fs/nfsd/nfs4xdr.c328
-rw-r--r--fs/nfsd/nfsctl.c190
-rw-r--r--fs/nfsd/nfsd.h31
-rw-r--r--fs/nfsd/nfsfh.c5
-rw-r--r--fs/nfsd/nfsproc.c9
-rw-r--r--fs/nfsd/nfssvc.c233
-rw-r--r--fs/nfsd/nfsxdr.c11
-rw-r--r--fs/nfsd/state.h76
-rw-r--r--fs/nfsd/vfs.c165
-rw-r--r--fs/nfsd/vfs.h11
-rw-r--r--fs/nfsd/xdr4.h15
-rw-r--r--fs/nilfs2/alloc.h14
-rw-r--r--fs/nilfs2/bmap.h7
-rw-r--r--fs/nilfs2/btnode.h8
-rw-r--r--fs/nilfs2/cpfile.c10
-rw-r--r--fs/nilfs2/dat.c6
-rw-r--r--fs/nilfs2/export.h8
-rw-r--r--fs/nilfs2/file.c22
-rw-r--r--fs/nilfs2/ifile.c6
-rw-r--r--fs/nilfs2/inode.c39
-rw-r--r--fs/nilfs2/ioctl.c11
-rw-r--r--fs/nilfs2/mdt.h7
-rw-r--r--fs/nilfs2/namei.c4
-rw-r--r--fs/nilfs2/nilfs.h18
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/nilfs2/sufile.c8
-rw-r--r--fs/nilfs2/super.c20
-rw-r--r--fs/nilfs2/the_nilfs.c1
-rw-r--r--fs/nilfs2/the_nilfs.h8
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/dnotify.c4
-rw-r--r--fs/notify/fanotify/Kconfig2
-rw-r--r--fs/notify/fanotify/fanotify.c7
-rw-r--r--fs/notify/fanotify/fanotify_user.c135
-rw-r--r--fs/notify/fdinfo.c179
-rw-r--r--fs/notify/fdinfo.h27
-rw-r--r--fs/notify/fsnotify.c3
-rw-r--r--fs/notify/group.c47
-rw-r--r--fs/notify/inode_mark.c19
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c4
-rw-r--r--fs/notify/inotify/inotify_user.c64
-rw-r--r--fs/notify/mark.c91
-rw-r--r--fs/notify/notification.c3
-rw-r--r--fs/notify/vfsmount_mark.c14
-rw-r--r--fs/ntfs/file.c19
-rw-r--r--fs/ntfs/inode.c15
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/namei.c2
-rw-r--r--fs/ntfs/super.c62
-rw-r--r--fs/ntfs/volume.h5
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c38
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/dcache.c22
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c7
-rw-r--r--fs/ocfs2/extent_map.c12
-rw-r--r--fs/ocfs2/file.c46
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/journal.c7
-rw-r--r--fs/ocfs2/localalloc.c8
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/quota_global.c43
-rw-r--r--fs/ocfs2/quota_local.c15
-rw-r--r--fs/ocfs2/refcounttree.c11
-rw-r--r--fs/ocfs2/super.c5
-rw-r--r--fs/omfs/dir.c4
-rw-r--r--fs/omfs/file.c27
-rw-r--r--fs/omfs/inode.c8
-rw-r--r--fs/omfs/omfs.h4
-rw-r--r--fs/open.c483
-rw-r--r--fs/openpromfs/inode.c9
-rw-r--r--fs/pipe.c102
-rw-r--r--fs/pnode.c5
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/posix_acl.c30
-rw-r--r--fs/proc/Makefile3
-rw-r--r--fs/proc/array.c27
-rw-r--r--fs/proc/base.c769
-rw-r--r--fs/proc/fd.c369
-rw-r--r--fs/proc/fd.h14
-rw-r--r--fs/proc/generic.c65
-rw-r--r--fs/proc/inode.c7
-rw-r--r--fs/proc/internal.h59
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/namespaces.c189
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/proc_devtree.c11
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c35
-rw-r--r--fs/proc/root.c29
-rw-r--r--fs/proc/self.c59
-rw-r--r--fs/proc/stat.c14
-rw-r--r--fs/proc/task_mmu.c116
-rw-r--r--fs/proc_namespace.c7
-rw-r--r--fs/pstore/Kconfig20
-rw-r--r--fs/pstore/Makefile1
-rw-r--r--fs/pstore/ftrace.c131
-rw-r--r--fs/pstore/inode.c121
-rw-r--r--fs/pstore/internal.h53
-rw-r--r--fs/pstore/platform.c83
-rw-r--r--fs/pstore/ram.c403
-rw-r--r--fs/pstore/ram_core.c149
-rw-r--r--fs/qnx4/bitmap.c24
-rw-r--r--fs/qnx4/inode.c9
-rw-r--r--fs/qnx4/namei.c2
-rw-r--r--fs/qnx4/qnx4.h2
-rw-r--r--fs/qnx6/inode.c10
-rw-r--r--fs/qnx6/namei.c2
-rw-r--r--fs/qnx6/qnx6.h2
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/dquot.c144
-rw-r--r--fs/quota/kqid.c132
-rw-r--r--fs/quota/netlink.c10
-rw-r--r--fs/quota/quota.c42
-rw-r--r--fs/quota/quota_tree.c22
-rw-r--r--fs/quota/quota_v1.c12
-rw-r--r--fs/quota/quota_v2.c26
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c238
-rw-r--r--fs/read_write.h2
-rw-r--r--fs/readdir.c36
-rw-r--r--fs/reiserfs/bitmap.c2
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c59
-rw-r--r--fs/reiserfs/namei.c12
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c70
-rw-r--r--fs/reiserfs/xattr.c6
-rw-r--r--fs/reiserfs/xattr_acl.c24
-rw-r--r--fs/romfs/super.c7
-rw-r--r--fs/select.c41
-rw-r--r--fs/seq_file.c28
-rw-r--r--fs/signalfd.c31
-rw-r--r--fs/splice.c81
-rw-r--r--fs/squashfs/inode.c8
-rw-r--r--fs/squashfs/namei.c2
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/stat.c30
-rw-r--r--fs/statfs.c18
-rw-r--r--fs/super.c349
-rw-r--r--fs/sync.c96
-rw-r--r--fs/sysfs/bin.c2
-rw-r--r--fs/sysfs/dir.c57
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h1
-rw-r--r--fs/sysv/balloc.c18
-rw-r--r--fs/sysv/file.c5
-rw-r--r--fs/sysv/ialloc.c14
-rw-r--r--fs/sysv/inode.c35
-rw-r--r--fs/sysv/itree.c17
-rw-r--r--fs/sysv/namei.c4
-rw-r--r--fs/sysv/super.c1
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/timerfd.c45
-rw-r--r--fs/ubifs/budget.c9
-rw-r--r--fs/ubifs/commit.c8
-rw-r--r--fs/ubifs/compress.c7
-rw-r--r--fs/ubifs/debug.c652
-rw-r--r--fs/ubifs/debug.h22
-rw-r--r--fs/ubifs/dir.c14
-rw-r--r--fs/ubifs/file.c15
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/gc.c6
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/log.c14
-rw-r--r--fs/ubifs/lprops.c72
-rw-r--r--fs/ubifs/lpt.c10
-rw-r--r--fs/ubifs/lpt_commit.c58
-rw-r--r--fs/ubifs/orphan.c11
-rw-r--r--fs/ubifs/recovery.c13
-rw-r--r--fs/ubifs/replay.c35
-rw-r--r--fs/ubifs/sb.c23
-rw-r--r--fs/ubifs/scan.c15
-rw-r--r--fs/ubifs/super.c137
-rw-r--r--fs/ubifs/tnc_misc.c4
-rw-r--r--fs/ubifs/ubifs.h20
-rw-r--r--fs/udf/file.c44
-rw-r--r--fs/udf/inode.c94
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/udf/super.c163
-rw-r--r--fs/udf/truncate.c4
-rw-r--r--fs/udf/udf_sb.h4
-rw-r--r--fs/udf/udfdecl.h1
-rw-r--r--fs/ufs/balloc.c38
-rw-r--r--fs/ufs/ialloc.c20
-rw-r--r--fs/ufs/inode.c31
-rw-r--r--fs/ufs/namei.c4
-rw-r--r--fs/ufs/super.c170
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/utimes.c17
-rw-r--r--fs/xattr.c337
-rw-r--r--fs/xattr_acl.c96
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c183
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c79
-rw-r--r--fs/xfs/xfs_alloc_btree.h16
-rw-r--r--fs/xfs/xfs_aops.c208
-rw-r--r--fs/xfs/xfs_aops.h14
-rw-r--r--fs/xfs/xfs_attr.c173
-rw-r--r--fs/xfs/xfs_attr_leaf.c412
-rw-r--r--fs/xfs/xfs_attr_leaf.h27
-rw-r--r--fs/xfs/xfs_bmap.c129
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c336
-rw-r--r--fs/xfs/xfs_buf.h168
-rw-r--r--fs/xfs/xfs_buf_item.c408
-rw-r--r--fs/xfs/xfs_buf_item.h38
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c952
-rw-r--r--fs/xfs/xfs_da_btree.h46
-rw-r--r--fs/xfs/xfs_dfrag.c47
-rw-r--r--fs/xfs/xfs_dinode.h2
-rw-r--r--fs/xfs/xfs_dir2.c4
-rw-r--r--fs/xfs/xfs_dir2_block.c544
-rw-r--r--fs/xfs/xfs_dir2_data.c220
-rw-r--r--fs/xfs/xfs_dir2_leaf.c773
-rw-r--r--fs/xfs/xfs_dir2_node.c518
-rw-r--r--fs/xfs/xfs_dir2_priv.h63
-rw-r--r--fs/xfs/xfs_dir2_sf.c4
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c450
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c158
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c527
-rw-r--r--fs/xfs/xfs_ialloc.h6
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)909
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c720
-rw-r--r--fs/xfs/xfs_inode.c580
-rw-r--r--fs/xfs/xfs_inode.h25
-rw-r--r--fs/xfs/xfs_ioctl.c93
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c54
-rw-r--r--fs/xfs/xfs_iops.c59
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c483
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h20
-rw-r--r--fs/xfs/xfs_log_recover.c280
-rw-r--r--fs/xfs/xfs_mount.c201
-rw-r--r--fs/xfs/xfs_mount.h24
-rw-r--r--fs/xfs/xfs_qm.c24
-rw-r--r--fs/xfs/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/xfs_quotaops.c12
-rw-r--r--fs/xfs/xfs_rtalloc.c18
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c335
-rw-r--r--fs/xfs/xfs_super.h3
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h64
-rw-r--r--fs/xfs/xfs_trans.c17
-rw-r--r--fs/xfs/xfs_trans.h49
-rw-r--r--fs/xfs/xfs_trans_ail.c35
-rw-r--r--fs/xfs/xfs_trans_buf.c98
-rw-r--r--fs/xfs/xfs_trans_dquot.c8
-rw-r--r--fs/xfs/xfs_trans_priv.h1
-rw-r--r--fs/xfs/xfs_types.h14
-rw-r--r--fs/xfs/xfs_utils.c17
-rw-r--r--fs/xfs/xfs_vnodeops.c449
-rw-r--r--fs/xfs/xfs_vnodeops.h9
839 files changed, 83282 insertions, 30181 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 9a1d4263075..15b67916620 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -37,7 +37,7 @@ static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
return ERR_PTR(-ENOMEM);
size = v9fs_fid_xattr_get(fid, name, value, size);
if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
goto err_out;
}
@@ -131,7 +131,7 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
buffer = kmalloc(size, GFP_KERNEL);
if (!buffer)
return -ENOMEM;
- retval = posix_acl_to_xattr(acl, buffer, size);
+ retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
if (retval < 0)
goto err_free_out;
switch (type) {
@@ -251,7 +251,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return PTR_ERR(acl);
if (acl == NULL)
return -ENODATA;
- error = posix_acl_to_xattr(acl, buffer, size);
+ error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
posix_acl_release(acl);
return error;
@@ -304,7 +304,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
return -EPERM;
if (value) {
/* update the cached acl value */
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index b85efa77394..d934f04e773 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -184,10 +184,20 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
v9ses->afid = option;
break;
case Opt_uname:
- match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
+ kfree(v9ses->uname);
+ v9ses->uname = match_strdup(&args[0]);
+ if (!v9ses->uname) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
break;
case Opt_remotename:
- match_strlcpy(v9ses->aname, &args[0], PATH_MAX);
+ kfree(v9ses->aname);
+ v9ses->aname = match_strdup(&args[0]);
+ if (!v9ses->aname) {
+ ret = -ENOMEM;
+ goto free_and_return;
+ }
break;
case Opt_nodevmap:
v9ses->nodev = 1;
@@ -287,21 +297,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
struct p9_fid *fid;
int rc;
- v9ses->uname = __getname();
+ v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
if (!v9ses->uname)
return ERR_PTR(-ENOMEM);
- v9ses->aname = __getname();
+ v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
if (!v9ses->aname) {
- __putname(v9ses->uname);
+ kfree(v9ses->uname);
return ERR_PTR(-ENOMEM);
}
init_rwsem(&v9ses->rename_sem);
rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
if (rc) {
- __putname(v9ses->aname);
- __putname(v9ses->uname);
+ kfree(v9ses->aname);
+ kfree(v9ses->uname);
return ERR_PTR(rc);
}
@@ -309,8 +319,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
list_add(&v9ses->slist, &v9fs_sessionlist);
spin_unlock(&v9fs_sessionlist_lock);
- strcpy(v9ses->uname, V9FS_DEFUSER);
- strcpy(v9ses->aname, V9FS_DEFANAME);
v9ses->uid = ~0;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -412,8 +420,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->cachetag);
}
#endif
- __putname(v9ses->uname);
- __putname(v9ses->aname);
+ kfree(v9ses->uname);
+ kfree(v9ses->aname);
bdi_destroy(&v9ses->bdi);
@@ -560,6 +568,11 @@ static int v9fs_init_inode_cache(void)
*/
static void v9fs_destroy_inode_cache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(v9fs_inode_cache);
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index e78956cbd70..34c59f14a1c 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -144,7 +144,7 @@ extern void v9fs_session_close(struct v9fs_session_info *v9ses);
extern void v9fs_session_cancel(struct v9fs_session_info *v9ses);
extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nameidata);
+ unsigned int flags);
extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d529437ff44..64600b5d052 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -100,13 +100,13 @@ static void v9fs_dentry_release(struct dentry *dentry)
}
}
-static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
struct p9_fid *fid;
struct inode *inode;
struct v9fs_inode *v9inode;
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
inode = dentry->d_inode;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fc06fd27065..c2483e97bee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
page, (unsigned long)filp->private_data);
+ /* Update file times before taking page lock */
+ file_update_time(filp);
+
v9inode = V9FS_I(inode);
/* make sure the cache has finished storing the page */
v9fs_fscache_wait_on_page_write(inode, page);
@@ -735,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = v9fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 57ccb7537da..890bed538f9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -712,88 +712,34 @@ error:
}
/**
- * v9fs_vfs_create - VFS hook to create files
+ * v9fs_vfs_create - VFS hook to create a regular file
+ *
+ * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called
+ * for mknod(2).
+ *
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
* @mode: create permissions
- * @nd: path information
*
*/
static int
v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
- int err;
- u32 perm;
- int flags;
- struct file *filp;
- struct v9fs_inode *v9inode;
- struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *inode_fid;
-
- err = 0;
- fid = NULL;
- v9ses = v9fs_inode2v9ses(dir);
- perm = unixmode2p9mode(v9ses, mode);
- if (nd)
- flags = nd->intent.open.flags;
- else
- flags = O_RDWR;
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+ u32 perm = unixmode2p9mode(v9ses, mode);
+ struct p9_fid *fid;
- fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags,
- v9fs_proto_dotu(v9ses)));
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- fid = NULL;
- goto error;
- }
+ /* P9_OEXCL? */
+ fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR);
+ if (IS_ERR(fid))
+ return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- /* if we are opening a file, assign the open fid to the file */
- if (nd) {
- v9inode = V9FS_I(dentry->d_inode);
- mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
- ((flags & O_ACCMODE) != O_RDONLY)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during open time instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- inode_fid = v9fs_writeback_fid(dentry);
- if (IS_ERR(inode_fid)) {
- err = PTR_ERR(inode_fid);
- mutex_unlock(&v9inode->v_mutex);
- goto error;
- }
- v9inode->writeback_fid = (void *) inode_fid;
- }
- mutex_unlock(&v9inode->v_mutex);
- filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
- if (IS_ERR(filp)) {
- err = PTR_ERR(filp);
- goto error;
- }
-
- filp->private_data = fid;
-#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache)
- v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
-#endif
- } else
- p9_client_clunk(fid);
+ p9_client_clunk(fid);
return 0;
-
-error:
- if (fid)
- p9_client_clunk(fid);
-
- return err;
}
/**
@@ -839,7 +785,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
*/
struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nameidata)
+ unsigned int flags)
{
struct dentry *res;
struct super_block *sb;
@@ -849,8 +795,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
char *name;
int result = 0;
- p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
- dir, dentry->d_name.name, dentry, nameidata);
+ p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
+ dir, dentry->d_name.name, dentry, flags);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -910,6 +856,86 @@ error:
return ERR_PTR(result);
}
+static int
+v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t mode,
+ int *opened)
+{
+ int err;
+ u32 perm;
+ struct v9fs_inode *v9inode;
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid, *inode_fid;
+ struct dentry *res = NULL;
+
+ if (d_unhashed(dentry)) {
+ res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ dentry = res;
+ }
+
+ /* Only creates */
+ if (!(flags & O_CREAT) || dentry->d_inode)
+ return finish_no_open(file, res);
+
+ err = 0;
+ fid = NULL;
+ v9ses = v9fs_inode2v9ses(dir);
+ perm = unixmode2p9mode(v9ses, mode);
+ fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
+ v9fs_uflags2omode(flags,
+ v9fs_proto_dotu(v9ses)));
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ fid = NULL;
+ goto error;
+ }
+
+ v9fs_invalidate_inode_attr(dir);
+ v9inode = V9FS_I(dentry->d_inode);
+ mutex_lock(&v9inode->v_mutex);
+ if (v9ses->cache && !v9inode->writeback_fid &&
+ ((flags & O_ACCMODE) != O_RDONLY)) {
+ /*
+ * clone a fid and add it to writeback_fid
+ * we do it during open time instead of
+ * page dirty time via write_begin/page_mkwrite
+ * because we want write after unlink usecase
+ * to work.
+ */
+ inode_fid = v9fs_writeback_fid(dentry);
+ if (IS_ERR(inode_fid)) {
+ err = PTR_ERR(inode_fid);
+ mutex_unlock(&v9inode->v_mutex);
+ goto error;
+ }
+ v9inode->writeback_fid = (void *) inode_fid;
+ }
+ mutex_unlock(&v9inode->v_mutex);
+ err = finish_open(file, dentry, generic_file_open, opened);
+ if (err)
+ goto error;
+
+ file->private_data = fid;
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache)
+ v9fs_cache_inode_set_cookie(dentry->d_inode, file);
+#endif
+
+ *opened |= FILE_CREATED;
+out:
+ dput(res);
+ return err;
+
+error:
+ if (fid)
+ p9_client_clunk(fid);
+ goto out;
+}
+
/**
* v9fs_vfs_unlink - VFS unlink hook to delete an inode
* @i: inode that is being unlinked
@@ -1250,12 +1276,12 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
}
/* copy extension buffer into buffer */
- strncpy(buffer, st->extension, buflen);
+ retval = min(strlen(st->extension)+1, (size_t)buflen);
+ memcpy(buffer, st->extension, retval);
- p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
- dentry->d_name.name, st->extension, buffer);
+ p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
+ dentry->d_name.name, st->extension, buflen, buffer);
- retval = strnlen(buffer, buflen);
done:
p9stat_free(st);
kfree(st);
@@ -1488,6 +1514,7 @@ out:
static const struct inode_operations v9fs_dir_inode_operations_dotu = {
.create = v9fs_vfs_create,
.lookup = v9fs_vfs_lookup,
+ .atomic_open = v9fs_vfs_atomic_open,
.symlink = v9fs_vfs_symlink,
.link = v9fs_vfs_link,
.unlink = v9fs_vfs_unlink,
@@ -1502,6 +1529,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
static const struct inode_operations v9fs_dir_inode_operations = {
.create = v9fs_vfs_create,
.lookup = v9fs_vfs_lookup,
+ .atomic_open = v9fs_vfs_atomic_open,
.unlink = v9fs_vfs_unlink,
.mkdir = v9fs_vfs_mkdir,
.rmdir = v9fs_vfs_rmdir,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e3dd2a1e2bf..40895546e10 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -230,20 +230,25 @@ int v9fs_open_to_dotl_flags(int flags)
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
* @mode: create permissions
- * @nd: path information
*
*/
static int
v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
- struct nameidata *nd)
+ bool excl)
+{
+ return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+}
+
+static int
+v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags, umode_t omode,
+ int *opened)
{
int err = 0;
gid_t gid;
- int flags;
umode_t mode;
char *name = NULL;
- struct file *filp;
struct p9_qid qid;
struct inode *inode;
struct p9_fid *fid = NULL;
@@ -251,19 +256,23 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct p9_fid *dfid, *ofid, *inode_fid;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
+ struct dentry *res = NULL;
- v9ses = v9fs_inode2v9ses(dir);
- if (nd)
- flags = nd->intent.open.flags;
- else {
- /*
- * create call without LOOKUP_OPEN is due
- * to mknod of regular files. So use mknod
- * operation.
- */
- return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+ if (d_unhashed(dentry)) {
+ res = v9fs_vfs_lookup(dir, dentry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ dentry = res;
}
+ /* Only creates */
+ if (!(flags & O_CREAT) || dentry->d_inode)
+ return finish_no_open(file, res);
+
+ v9ses = v9fs_inode2v9ses(dir);
+
name = (char *) dentry->d_name.name;
p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
name, flags, omode);
@@ -272,7 +281,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- return err;
+ goto out;
}
/* clone a fid to use for creation */
@@ -280,7 +289,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- return err;
+ goto out;
}
gid = v9fs_get_fsgid_for_create(dir);
@@ -345,17 +354,18 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
}
mutex_unlock(&v9inode->v_mutex);
/* Since we are opening a file, assign the open fid to the file */
- filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
- if (IS_ERR(filp)) {
- err = PTR_ERR(filp);
+ err = finish_open(file, dentry, generic_file_open, opened);
+ if (err)
goto err_clunk_old_fid;
- }
- filp->private_data = ofid;
+ file->private_data = ofid;
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache)
- v9fs_cache_inode_set_cookie(inode, filp);
+ v9fs_cache_inode_set_cookie(inode, file);
#endif
- return 0;
+ *opened |= FILE_CREATED;
+out:
+ dput(res);
+ return err;
error:
if (fid)
@@ -364,7 +374,7 @@ err_clunk_old_fid:
if (ofid)
p9_client_clunk(ofid);
v9fs_set_create_acl(NULL, &dacl, &pacl);
- return err;
+ goto out;
}
/**
@@ -982,6 +992,7 @@ out:
const struct inode_operations v9fs_dir_inode_operations_dotl = {
.create = v9fs_vfs_create_dotl,
+ .atomic_open = v9fs_vfs_atomic_open_dotl,
.lookup = v9fs_vfs_lookup,
.link = v9fs_vfs_link_dotl,
.symlink = v9fs_vfs_symlink_dotl,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8c92a9ba833..137d5039689 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -89,7 +89,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
- sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
if (!v9ses->cache)
sb->s_flags |= MS_SYNCHRONOUS;
@@ -137,7 +137,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto close_session;
}
- sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
+ sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
if (IS_ERR(sb)) {
retval = PTR_ERR(sb);
goto clunk_fid;
diff --git a/fs/Kconfig b/fs/Kconfig
index f95ae3a027f..780725a463b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,8 +28,8 @@ config FS_MBCACHE
tristate
default y if EXT2_FS=y && EXT2_FS_XATTR
default y if EXT3_FS=y && EXT3_FS_XATTR
- default y if EXT4_FS=y && EXT4_FS_XATTR
- default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+ default y if EXT4_FS=y
+ default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
@@ -68,16 +68,6 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
-config CUSE
- tristate "Character device in Userspace support"
- depends on FUSE_FS
- help
- This FUSE extension allows character devices to be
- implemented in userspace.
-
- If you want to develop or use userspace character device
- based on CUSE, answer Y or M.
-
config GENERIC_ACL
bool
select FS_POSIX_ACL
@@ -220,6 +210,7 @@ source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 02257420274..0efd1524b97 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -164,3 +164,11 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config COREDUMP
+ bool "Enable core dump support" if EXPERT
+ default y
+ help
+ This option enables support for performing core dumps. You almost
+ certainly want to say Y here. Not necessary on systems that never
+ need debugging or only ever run flawless code.
diff --git a/fs/Makefile b/fs/Makefile
index 2fb97793467..9d53192236f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
+obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_FHANDLE) += fhandle.o
@@ -122,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_F2FS_FS) += f2fs/
obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 718ac1f440c..585adafb0cc 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -46,8 +46,8 @@ struct adfs_sb_info {
struct adfs_discmap *s_map; /* bh list containing map */
struct adfs_dir_ops *s_dir; /* directory operations */
- uid_t s_uid; /* owner uid */
- gid_t s_gid; /* owner gid */
+ kuid_t s_uid; /* owner uid */
+ kgid_t s_gid; /* owner gid */
umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
umode_t s_other_mask; /* ADFS other perm -> unix perm */
int s_ftsuffix; /* ,xyz hex filetype suffix option */
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3d83075aaa2..b3be2e7c564 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -266,7 +266,7 @@ const struct dentry_operations adfs_dentry_operations = {
};
static struct dentry *
-adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode = NULL;
struct object_info obj;
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 1dab6a174d6..5f95d1ed9c6 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, adfs_get_block);
}
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int adfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ adfs_write_failed(mapping, pos + len);
return ret;
}
@@ -304,8 +309,8 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
* we can't change the UID or GID of any file -
* we have a global UID/GID in the superblock
*/
- if ((ia_valid & ATTR_UID && attr->ia_uid != ADFS_SB(sb)->s_uid) ||
- (ia_valid & ATTR_GID && attr->ia_gid != ADFS_SB(sb)->s_gid))
+ if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, ADFS_SB(sb)->s_uid)) ||
+ (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, ADFS_SB(sb)->s_gid)))
error = -EPERM;
if (error)
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 06fdcc9382c..d5712293579 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
+#include <linux/user_namespace.h>
#include "adfs.h"
#include "dir_f.h"
#include "dir_fplus.h"
@@ -130,10 +131,10 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
{
struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
- if (asb->s_uid != 0)
- seq_printf(seq, ",uid=%u", asb->s_uid);
- if (asb->s_gid != 0)
- seq_printf(seq, ",gid=%u", asb->s_gid);
+ if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid));
+ if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid));
if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK)
seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
@@ -175,12 +176,16 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_uid:
if (match_int(args, &option))
return -EINVAL;
- asb->s_uid = option;
+ asb->s_uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(asb->s_uid))
+ return -EINVAL;
break;
case Opt_gid:
if (match_int(args, &option))
return -EINVAL;
- asb->s_gid = option;
+ asb->s_gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(asb->s_gid))
+ return -EINVAL;
break;
case Opt_ownmask:
if (match_octal(args, &option))
@@ -246,7 +251,6 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
static void adfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
}
@@ -276,6 +280,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(adfs_inode_cachep);
}
@@ -370,8 +379,8 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = asb;
/* set default options */
- asb->s_uid = 0;
- asb->s_gid = 0;
+ asb->s_uid = GLOBAL_ROOT_UID;
+ asb->s_gid = GLOBAL_ROOT_GID;
asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
asb->s_ftsuffix = 0;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1fceb320d2f..3952121f2f2 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -3,6 +3,7 @@
#include <linux/buffer_head.h>
#include <linux/amigaffs.h>
#include <linux/mutex.h>
+#include <linux/workqueue.h>
/* AmigaOS allows file names with up to 30 characters length.
* Names longer than that will be silently truncated. If you
@@ -87,8 +88,8 @@ struct affs_sb_info {
u32 s_root_block; /* FFS root block number. */
int s_hashsize; /* Size of hash table. */
unsigned long s_flags; /* See below. */
- uid_t s_uid; /* uid to override */
- gid_t s_gid; /* gid to override */
+ kuid_t s_uid; /* uid to override */
+ kgid_t s_gid; /* gid to override */
umode_t s_mode; /* mode to override */
struct buffer_head *s_root_bh; /* Cached root block. */
struct mutex s_bmlock; /* Protects bitmap access. */
@@ -100,6 +101,10 @@ struct affs_sb_info {
char *s_prefix; /* Prefix for volumes and assigns. */
char s_volume[32]; /* Volume prefix for absolute symlinks. */
spinlock_t symlink_lock; /* protects the previous two */
+ struct super_block *sb; /* the VFS superblock object */
+ int work_queued; /* non-zero delayed work is queued */
+ struct delayed_work sb_work; /* superblock flush delayed work */
+ spinlock_t work_lock; /* protects sb_work and work_queued */
};
#define SF_INTL 0x0001 /* International filesystem. */
@@ -120,6 +125,8 @@ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
return sb->s_fs_info;
}
+void affs_mark_sb_dirty(struct super_block *sb);
+
/* amigaffs.c */
extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh);
@@ -146,9 +153,9 @@ extern void affs_free_bitmap(struct super_block *sb);
/* namei.c */
extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
-extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *);
+extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *);
+extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool);
extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 52a6407682e..eb82ee53ee0 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -122,22 +122,16 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
}
static void
-affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
+affs_fix_dcache(struct inode *inode, u32 entry_ino)
{
- struct inode *inode = dentry->d_inode;
- void *data = dentry->d_fsdata;
- struct list_head *head, *next;
-
+ struct dentry *dentry;
+ struct hlist_node *p;
spin_lock(&inode->i_lock);
- head = &inode->i_dentry;
- next = head->next;
- while (next != head) {
- dentry = list_entry(next, struct dentry, d_alias);
+ hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
if (entry_ino == (u32)(long)dentry->d_fsdata) {
- dentry->d_fsdata = data;
+ dentry->d_fsdata = (void *)inode->i_ino;
break;
}
- next = next->next;
}
spin_unlock(&inode->i_lock);
}
@@ -177,7 +171,11 @@ affs_remove_link(struct dentry *dentry)
}
affs_lock_dir(dir);
- affs_fix_dcache(dentry, link_ino);
+ /*
+ * if there's a dentry for that block, make it
+ * refer to inode itself.
+ */
+ affs_fix_dcache(inode, link_ino);
retval = affs_remove_hash(dir, link_bh);
if (retval) {
affs_unlock_dir(dir);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 3e262711ae0..a32246b8359 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -10,30 +10,6 @@
#include <linux/slab.h>
#include "affs.h"
-/* This is, of course, shamelessly stolen from fs/minix */
-
-static const int nibblemap[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
-
-static u32
-affs_count_free_bits(u32 blocksize, const void *data)
-{
- const u32 *map;
- u32 free;
- u32 tmp;
-
- map = data;
- free = 0;
- for (blocksize /= 4; blocksize > 0; blocksize--) {
- tmp = *map++;
- while (tmp) {
- free += nibblemap[tmp & 0xf];
- tmp >>= 4;
- }
- }
-
- return free;
-}
-
u32
affs_count_free_blocks(struct super_block *sb)
{
@@ -103,7 +79,7 @@ affs_free_block(struct super_block *sb, u32 block)
*(__be32 *)bh->b_data = cpu_to_be32(tmp - mask);
mark_buffer_dirty(bh);
- sb->s_dirt = 1;
+ affs_mark_sb_dirty(sb);
bm->bm_free++;
mutex_unlock(&sbi->s_bmlock);
@@ -248,7 +224,7 @@ find_bit:
*(__be32 *)bh->b_data = cpu_to_be32(tmp + mask);
mark_buffer_dirty(bh);
- sb->s_dirt = 1;
+ affs_mark_sb_dirty(sb);
mutex_unlock(&sbi->s_bmlock);
@@ -317,7 +293,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
goto out;
}
pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key);
- bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4);
+ bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
/* Don't try read the extension if this is the last block,
* but we also need the right bm pointer below
@@ -367,7 +343,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
/* recalculate bitmap count for last block */
bm--;
- bm->bm_free = affs_count_free_bits(sb->s_blocksize - 4, bh->b_data + 4);
+ bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
out:
affs_brelse(bh);
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb32..af3261b7810 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
};
const struct inode_operations affs_file_inode_operations = {
- .truncate = affs_truncate,
.setattr = affs_notify_change,
};
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, affs_get_block);
}
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ affs_truncate(inode);
+ }
+}
+
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ affs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 8bc4a59f4e7..0e092d08680 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -80,17 +80,17 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
if (id == 0 || sbi->s_flags & SF_SETUID)
inode->i_uid = sbi->s_uid;
else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
- inode->i_uid = 0;
+ i_uid_write(inode, 0);
else
- inode->i_uid = id;
+ i_uid_write(inode, id);
id = be16_to_cpu(tail->gid);
if (id == 0 || sbi->s_flags & SF_SETGID)
inode->i_gid = sbi->s_gid;
else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
- inode->i_gid = 0;
+ i_gid_write(inode, 0);
else
- inode->i_gid = id;
+ i_gid_write(inode, id);
switch (be32_to_cpu(tail->stype)) {
case ST_ROOT:
@@ -193,13 +193,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
tail->size = cpu_to_be32(inode->i_size);
secs_to_datestamp(inode->i_mtime.tv_sec,&tail->change);
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
- uid = inode->i_uid;
- gid = inode->i_gid;
+ uid = i_uid_read(inode);
+ gid = i_gid_read(inode);
if (AFFS_SB(sb)->s_flags & SF_MUFS) {
- if (inode->i_uid == 0 || inode->i_uid == 0xFFFF)
- uid = inode->i_uid ^ ~0;
- if (inode->i_gid == 0 || inode->i_gid == 0xFFFF)
- gid = inode->i_gid ^ ~0;
+ if (uid == 0 || uid == 0xFFFF)
+ uid = uid ^ ~0;
+ if (gid == 0 || gid == 0xFFFF)
+ gid = gid ^ ~0;
}
if (!(AFFS_SB(sb)->s_flags & SF_SETUID))
tail->uid = cpu_to_be16(uid);
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ affs_truncate(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 47806940aac..ff65884a783 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -211,7 +211,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
}
struct dentry *
-affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
@@ -255,7 +255,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry)
}
int
-affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
+affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0782653a05a..b84dc735250 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/writeback.h>
#include "affs.h"
extern struct timezone sys_tz;
@@ -25,15 +26,17 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
-affs_commit_super(struct super_block *sb, int wait, int clean)
+affs_commit_super(struct super_block *sb, int wait)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
struct buffer_head *bh = sbi->s_root_bh;
struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
- tail->bm_flag = cpu_to_be32(clean);
+ lock_buffer(bh);
secs_to_datestamp(get_seconds(), &tail->disk_change);
affs_fix_checksum(sb, bh);
+ unlock_buffer(bh);
+
mark_buffer_dirty(bh);
if (wait)
sync_dirty_buffer(bh);
@@ -45,9 +48,7 @@ affs_put_super(struct super_block *sb)
struct affs_sb_info *sbi = AFFS_SB(sb);
pr_debug("AFFS: put_super()\n");
- if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
- affs_commit_super(sb, 1, 1);
-
+ cancel_delayed_work_sync(&sbi->sb_work);
kfree(sbi->s_prefix);
affs_free_bitmap(sb);
affs_brelse(sbi->s_root_bh);
@@ -55,26 +56,43 @@ affs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
-static void
-affs_write_super(struct super_block *sb)
+static int
+affs_sync_fs(struct super_block *sb, int wait)
+{
+ affs_commit_super(sb, wait);
+ return 0;
+}
+
+static void flush_superblock(struct work_struct *work)
{
- lock_super(sb);
- if (!(sb->s_flags & MS_RDONLY))
- affs_commit_super(sb, 1, 2);
- sb->s_dirt = 0;
- unlock_super(sb);
+ struct affs_sb_info *sbi;
+ struct super_block *sb;
- pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
+ sbi = container_of(work, struct affs_sb_info, sb_work.work);
+ sb = sbi->sb;
+
+ spin_lock(&sbi->work_lock);
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->work_lock);
+
+ affs_commit_super(sb, 1);
}
-static int
-affs_sync_fs(struct super_block *sb, int wait)
+void affs_mark_sb_dirty(struct super_block *sb)
{
- lock_super(sb);
- affs_commit_super(sb, wait, 2);
- sb->s_dirt = 0;
- unlock_super(sb);
- return 0;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+ unsigned long delay;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ spin_lock(&sbi->work_lock);
+ if (!sbi->work_queued) {
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ queue_delayed_work(system_long_wq, &sbi->sb_work, delay);
+ sbi->work_queued = 1;
+ }
+ spin_unlock(&sbi->work_lock);
}
static struct kmem_cache * affs_inode_cachep;
@@ -129,6 +147,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(affs_inode_cachep);
}
@@ -138,7 +161,6 @@ static const struct super_operations affs_sops = {
.write_inode = affs_write_inode,
.evict_inode = affs_evict_inode,
.put_super = affs_put_super,
- .write_super = affs_write_super,
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
.remount_fs = affs_remount,
@@ -171,7 +193,7 @@ static const match_table_t tokens = {
};
static int
-parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s32 *root,
+parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
{
char *p;
@@ -236,13 +258,17 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
case Opt_setgid:
if (match_int(&args[0], &option))
return 0;
- *gid = option;
+ *gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(*gid))
+ return 0;
*mount_opts |= SF_SETGID;
break;
case Opt_setuid:
if (match_int(&args[0], &option))
return 0;
- *uid = option;
+ *uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(*uid))
+ return 0;
*mount_opts |= SF_SETUID;
break;
case Opt_verbose:
@@ -284,8 +310,8 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
int num_bm;
int i, j;
s32 key;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
int reserved;
unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
@@ -305,8 +331,11 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->sb = sb;
mutex_init(&sbi->s_bmlock);
spin_lock_init(&sbi->symlink_lock);
+ spin_lock_init(&sbi->work_lock);
+ INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
@@ -507,8 +536,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
int blocksize;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
int mode;
int reserved;
int root_block;
@@ -531,6 +560,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
}
+ flush_delayed_work(&sbi->sb_work);
replace_mount_options(sb, new_opts);
sbi->s_flags = mount_flags;
@@ -549,10 +579,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
- if (*flags & MS_RDONLY) {
- affs_write_super(sb);
+ if (*flags & MS_RDONLY)
affs_free_bitmap(sb);
- } else
+ else
res = affs_init_bitmap(sb, flags);
return res;
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 587ef5123cd..7ef637d7f3a 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -351,9 +351,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work)
*/
void afs_flush_callback_breaks(struct afs_server *server)
{
- cancel_delayed_work(&server->cb_break_work);
- queue_delayed_work(afs_callback_update_worker,
- &server->cb_break_work, 0);
+ mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0);
}
#if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e22dc4b4a50..db477906ba4 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -20,16 +20,16 @@
#include "internal.h"
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd);
+ unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
-static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd);
+ bool excl);
static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
@@ -516,7 +516,7 @@ out:
* look up an entry in a directory
*/
static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct afs_vnode *vnode;
struct afs_fid fid;
@@ -598,7 +598,7 @@ success:
* - NOTE! the hit can be a negative hit too, so we can't assume we have an
* inode
*/
-static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct afs_vnode *vnode, *dir;
struct afs_fid uninitialized_var(fid);
@@ -607,7 +607,7 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
void *dir_version;
int ret;
- if (nd->flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU)
return -ECHILD;
vnode = AFS_FS_I(dentry->d_inode);
@@ -949,7 +949,7 @@ error:
* create a regular file on an AFS filesystem
*/
static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
struct afs_file_status status;
struct afs_callback cb;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 298cf8919ec..9682c33d5da 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -22,7 +22,7 @@
static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd);
+ unsigned int flags);
static int afs_mntpt_open(struct inode *inode, struct file *file);
static void afs_mntpt_expiry_timed_out(struct work_struct *work);
@@ -104,7 +104,7 @@ out:
*/
static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
_enter("%p,%p{%p{%s},%s}",
dir,
diff --git a/fs/afs/server.c b/fs/afs/server.c
index d59b7516e94..f342acf3547 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -285,12 +285,7 @@ static void afs_reap_server(struct work_struct *work)
expiry = server->time_of_death + afs_server_timeout;
if (expiry > now) {
delay = (expiry - now) * HZ;
- if (!queue_delayed_work(afs_wq, &afs_server_reaper,
- delay)) {
- cancel_delayed_work(&afs_server_reaper);
- queue_delayed_work(afs_wq, &afs_server_reaper,
- delay);
- }
+ mod_delayed_work(afs_wq, &afs_server_reaper, delay);
break;
}
@@ -323,6 +318,5 @@ static void afs_reap_server(struct work_struct *work)
void __exit afs_purge_servers(void)
{
afs_server_timeout = 0;
- cancel_delayed_work(&afs_server_reaper);
- queue_delayed_work(afs_wq, &afs_server_reaper, 0);
+ mod_delayed_work(afs_wq, &afs_server_reaper, 0);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f02b31e7e64..43165009428 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -123,6 +123,11 @@ void __exit afs_fs_exit(void)
BUG();
}
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(afs_inode_cachep);
_leave("");
}
@@ -395,7 +400,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
as->volume = vol;
/* allocate a deviceless superblock */
- sb = sget(fs_type, afs_test_super, afs_set_super, as);
+ sb = sget(fs_type, afs_test_super, afs_set_super, flags, as);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
afs_put_volume(vol);
@@ -406,7 +411,6 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
if (!sb->s_root) {
/* initial superblock/root creation */
_debug("create");
- sb->s_flags = flags;
ret = afs_fill_super(sb, &params);
if (ret < 0) {
deactivate_locked_super(sb);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 431984d2e37..57bcb159653 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -561,12 +561,7 @@ static void afs_vlocation_reaper(struct work_struct *work)
if (expiry > now) {
delay = (expiry - now) * HZ;
_debug("delay %lu", delay);
- if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
- delay)) {
- cancel_delayed_work(&afs_vlocation_reap);
- queue_delayed_work(afs_wq, &afs_vlocation_reap,
- delay);
- }
+ mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
break;
}
@@ -614,13 +609,10 @@ void afs_vlocation_purge(void)
spin_lock(&afs_vlocation_updates_lock);
list_del_init(&afs_vlocation_updates);
spin_unlock(&afs_vlocation_updates_lock);
- cancel_delayed_work(&afs_vlocation_update);
- queue_delayed_work(afs_vlocation_update_worker,
- &afs_vlocation_update, 0);
+ mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
destroy_workqueue(afs_vlocation_update_worker);
- cancel_delayed_work(&afs_vlocation_reap);
- queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
+ mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
}
/*
diff --git a/fs/aio.c b/fs/aio.c
index 55c4c765605..71f613cf4a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -56,13 +56,6 @@ static struct kmem_cache *kioctx_cachep;
static struct workqueue_struct *aio_wq;
-/* Used for rare fput completion. */
-static void aio_fput_routine(struct work_struct *);
-static DECLARE_WORK(fput_work, aio_fput_routine);
-
-static DEFINE_SPINLOCK(fput_lock);
-static LIST_HEAD(fput_head);
-
static void aio_kick_handler(struct work_struct *);
static void aio_queue_work(struct kioctx *);
@@ -479,7 +472,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
{
unsigned short allocated, to_alloc;
long avail;
- bool called_fput = false;
struct kiocb *req, *n;
struct aio_ring *ring;
@@ -495,28 +487,11 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
if (allocated == 0)
goto out;
-retry:
spin_lock_irq(&ctx->ctx_lock);
ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
BUG_ON(avail < 0);
- if (avail == 0 && !called_fput) {
- /*
- * Handle a potential starvation case. It is possible that
- * we hold the last reference on a struct file, causing us
- * to delay the final fput to non-irq context. In this case,
- * ctx->reqs_active is artificially high. Calling the fput
- * routine here may free up a slot in the event completion
- * ring, allowing this allocation to succeed.
- */
- kunmap_atomic(ring);
- spin_unlock_irq(&ctx->ctx_lock);
- aio_fput_routine(NULL);
- called_fput = true;
- goto retry;
- }
-
if (avail < allocated) {
/* Trim back the number of requests. */
list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
@@ -570,36 +545,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
wake_up_all(&ctx->wait);
}
-static void aio_fput_routine(struct work_struct *data)
-{
- spin_lock_irq(&fput_lock);
- while (likely(!list_empty(&fput_head))) {
- struct kiocb *req = list_kiocb(fput_head.next);
- struct kioctx *ctx = req->ki_ctx;
-
- list_del(&req->ki_list);
- spin_unlock_irq(&fput_lock);
-
- /* Complete the fput(s) */
- if (req->ki_filp != NULL)
- fput(req->ki_filp);
-
- /* Link the iocb into the context's free list */
- rcu_read_lock();
- spin_lock_irq(&ctx->ctx_lock);
- really_put_req(ctx, req);
- /*
- * at that point ctx might've been killed, but actual
- * freeing is RCU'd
- */
- spin_unlock_irq(&ctx->ctx_lock);
- rcu_read_unlock();
-
- spin_lock_irq(&fput_lock);
- }
- spin_unlock_irq(&fput_lock);
-}
-
/* __aio_put_req
* Returns true if this put was the last user of the request.
*/
@@ -618,21 +563,9 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
req->ki_cancel = NULL;
req->ki_retry = NULL;
- /*
- * Try to optimize the aio and eventfd file* puts, by avoiding to
- * schedule work in case it is not final fput() time. In normal cases,
- * we would not be holding the last reference to the file*, so
- * this function will be executed w/out any aio kthread wakeup.
- */
- if (unlikely(!fput_atomic(req->ki_filp))) {
- spin_lock(&fput_lock);
- list_add(&req->ki_list, &fput_head);
- spin_unlock(&fput_lock);
- schedule_work(&fput_work);
- } else {
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- }
+ fput(req->ki_filp);
+ req->ki_filp = NULL;
+ really_put_req(ctx, req);
return 1;
}
diff --git a/fs/attr.c b/fs/attr.c
index 0da90951d27..1449adb14ef 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,6 +14,7 @@
#include <linux/fcntl.h>
#include <linux/security.h>
#include <linux/evm.h>
+#include <linux/ima.h>
/**
* inode_change_ok - check if attribute changes to an inode are allowed
@@ -48,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
- !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN))
+ !uid_eq(attr->ia_uid, inode->i_uid)) &&
+ !inode_capable(inode, CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
(!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
- !capable(CAP_CHOWN))
+ !inode_capable(inode, CAP_CHOWN))
return -EPERM;
/* Make sure a caller can chmod. */
@@ -64,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
return -EPERM;
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- inode->i_gid) && !capable(CAP_FSETID))
+ inode->i_gid) &&
+ !inode_capable(inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -156,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ if (!in_group_p(inode->i_gid) &&
+ !inode_capable(inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
@@ -171,6 +175,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+
if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
@@ -245,10 +251,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
if (!error) {
fsnotify_change(dentry, ia_valid);
+ ima_inode_post_setattr(dentry);
evm_inode_post_setattr(dentry, ia_valid);
}
return error;
}
-
EXPORT_SYMBOL(notify_change);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 908e1845541..b785e770795 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_info {
unsigned long last_used;
atomic_t count;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
@@ -89,8 +89,8 @@ struct autofs_wait_queue {
struct qstr name;
u32 dev;
u64 ino;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
pid_t pid;
pid_t tgid;
/* This is for status reporting upon return */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index aa9103f8f01..9f68a37bb2b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p)
return ino && ino->sbi->type & *(unsigned *)p;
}
-static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
-{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
-
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
- __set_close_on_exec(fd, fdt);
- spin_unlock(&files->file_lock);
-}
-
-
/*
* Open a file descriptor on the autofs mount point corresponding
* to the given path and device number (aka. new_encode_dev(sb->s_dev)).
@@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
{
int err, fd;
- fd = get_unused_fd();
+ fd = get_unused_fd_flags(O_CLOEXEC);
if (likely(fd >= 0)) {
struct file *filp;
struct path path;
@@ -257,14 +243,14 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
* corresponding to the autofs fs we want to open.
*/
- filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
- current_cred());
+ filp = dentry_open(&path, O_RDONLY, current_cred());
+ path_put(&path);
if (IS_ERR(filp)) {
err = PTR_ERR(filp);
goto out;
}
- autofs_dev_ioctl_fd_install(fd, filp);
+ fd_install(fd, filp);
}
return fd;
@@ -451,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry);
spin_lock(&sbi->fs_lock);
- param->requester.uid = ino->uid;
- param->requester.gid = ino->gid;
+ param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1feb68ecef9..01443ce43ee 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
- struct dentry *p, *q;
+ struct dentry *q;
spin_lock(&sbi->lookup_lock);
+ spin_lock(&root->d_lock);
- if (prev == NULL) {
- spin_lock(&root->d_lock);
+ if (prev)
+ next = prev->d_u.d_child.next;
+ else {
prev = dget_dlock(root);
next = prev->d_subdirs.next;
- p = prev;
- goto start;
}
- p = prev;
- spin_lock(&p->d_lock);
-again:
- next = p->d_u.d_child.next;
-start:
+cont:
if (next == &root->d_subdirs) {
- spin_unlock(&p->d_lock);
+ spin_unlock(&root->d_lock);
spin_unlock(&sbi->lookup_lock);
dput(prev);
return NULL;
@@ -121,16 +117,15 @@ start:
q = list_entry(next, struct dentry, d_u.d_child);
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
- /* Negative dentry - try next */
- if (!simple_positive(q)) {
- spin_unlock(&p->d_lock);
- lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
- p = q;
- goto again;
+ /* Already gone or negative dentry (under construction) - try next */
+ if (q->d_count == 0 || !simple_positive(q)) {
+ spin_unlock(&q->d_lock);
+ next = q->d_u.d_child.next;
+ goto cont;
}
dget_dlock(q);
spin_unlock(&q->d_lock);
- spin_unlock(&p->d_lock);
+ spin_unlock(&root->d_lock);
spin_unlock(&sbi->lookup_lock);
dput(prev);
@@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
DPRINTK("checking mountpoint %p %.*s",
dentry, (int)dentry->d_name.len, dentry->d_name.name);
- /* Path walk currently on this dentry? */
- ino_count = atomic_read(&ino->count) + 2;
- if (dentry->d_count > ino_count)
- goto next;
-
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
goto next;
@@ -558,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_EXPIRING;
- spin_lock(&dentry->d_lock);
- if (!ret) {
- if ((IS_ROOT(dentry) ||
- (autofs_type_indirect(sbi->type) &&
- IS_ROOT(dentry->d_parent))) &&
- !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- __managed_dentry_set_automount(dentry);
- }
- spin_unlock(&dentry->d_lock);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8a4fed8ead3..b104726e2d0 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
void autofs4_clean_ino(struct autofs_info *ino)
{
- ino->uid = 0;
- ino->gid = 0;
+ ino->uid = GLOBAL_ROOT_UID;
+ ino->gid = GLOBAL_ROOT_GID;
ino->last_used = jiffies;
}
@@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
return 0;
seq_printf(m, ",fd=%d", sbi->pipefd);
- if (root_inode->i_uid != 0)
- seq_printf(m, ",uid=%u", root_inode->i_uid);
- if (root_inode->i_gid != 0)
- seq_printf(m, ",gid=%u", root_inode->i_gid);
+ if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, root_inode->i_uid));
+ if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, root_inode->i_gid));
seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
@@ -126,7 +128,7 @@ static const match_table_t tokens = {
{Opt_err, NULL}
};
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
{
char *p;
@@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
case Opt_uid:
if (match_int(args, &option))
return 1;
- *uid = option;
+ *uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(*uid))
+ return 1;
break;
case Opt_gid:
if (match_int(args, &option))
return 1;
- *gid = option;
+ *gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(*gid))
+ return 1;
break;
case Opt_pgrp:
if (match_int(args, &option))
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 75e5f1c8e02..c93447604da 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
+static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- spin_lock(&dentry->d_lock);
- if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!d_mountpoint(dentry) && simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
- spin_unlock(&dentry->d_lock);
spin_unlock(&sbi->lookup_lock);
out:
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
status = autofs4_mount_wait(dentry);
if (status)
return ERR_PTR(status);
- spin_lock(&sbi->fs_lock);
goto done;
}
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* having d_mountpoint() true, so there's no need to call back
* to the daemon.
*/
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
+
if (!d_mountpoint(dentry)) {
/*
* It's possible that user space hasn't removed directories
@@ -379,46 +378,26 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* require user space behave.
*/
if (sbi->version > 4) {
- if (have_submounts(dentry))
+ if (have_submounts(dentry)) {
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
} else {
- spin_lock(&dentry->d_lock);
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!simple_empty(dentry))
goto done;
- }
- spin_unlock(&dentry->d_lock);
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
status = autofs4_mount_wait(dentry);
- if (status)
- return ERR_PTR(status);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
- }
-done:
- if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
- /*
- * Any needed mounting has been completed and the path
- * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
- * call ->d_automount() on rootless multi-mounts since
- * it can lead to an incorrect ELOOP error return.
- *
- * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
- * symlinks as in all other cases the dentry will be covered by
- * an actual mount so ->d_automount() won't be called during
- * the follow.
- */
- spin_lock(&dentry->d_lock);
- if ((!d_mountpoint(dentry) &&
- !list_empty(&dentry->d_subdirs)) ||
- (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
- __managed_dentry_clear_automount(dentry);
- spin_unlock(&dentry->d_lock);
+ if (status) {
+ spin_unlock(&sbi->fs_lock);
+ return ERR_PTR(status);
+ }
}
spin_unlock(&sbi->fs_lock);
-
+done:
/* Mount succeeded, check if we ended up with a new dentry */
dentry = autofs4_mountpoint_changed(path);
if (!dentry)
@@ -430,6 +409,8 @@ done:
int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ int status;
DPRINTK("dentry=%p %.*s",
dentry, dentry->d_name.len, dentry->d_name.name);
@@ -454,11 +435,36 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* This dentry may be under construction so wait on mount
* completion.
*/
- return autofs4_mount_wait(dentry);
+ status = autofs4_mount_wait(dentry);
+ if (status)
+ return status;
+
+ spin_lock(&sbi->fs_lock);
+ /*
+ * If the dentry has been selected for expire while we slept
+ * on the lock then it might go away. We'll deal with that in
+ * ->d_automount() and wait on a new mount if the expire
+ * succeeds or return here if it doesn't (since there's no
+ * mount to follow with a rootless multi-mount).
+ */
+ if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+ /*
+ * Any needed mounting has been completed and the path
+ * updated so check if this is a rootless multi-mount so
+ * we can avoid needless calls ->d_automount() and avoid
+ * an incorrect ELOOP error return.
+ */
+ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+ (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+ status = -EISDIR;
+ }
+ spin_unlock(&sbi->fs_lock);
+
+ return status;
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -597,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
spin_lock(&sbi->lookup_lock);
__autofs4_add_expiring(dentry);
- spin_lock(&dentry->d_lock);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
+ d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
return 0;
@@ -670,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return -EACCES;
spin_lock(&sbi->lookup_lock);
- spin_lock(&dentry->d_lock);
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&dentry->d_lock);
+ if (!simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOTEMPTY;
}
__autofs4_add_expiring(dentry);
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
+ d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
if (sbi->version < 5)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index da8876d38a7..03bc1d347d8 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
case autofs_ptype_expire_direct:
{
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+ struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
pktsz = sizeof(*packet);
@@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
packet->name[wq->name.len] = '\0';
packet->dev = wq->dev;
packet->ino = wq->ino;
- packet->uid = wq->uid;
- packet->gid = wq->gid;
+ packet->uid = from_kuid_munged(user_ns, wq->uid);
+ packet->gid = from_kgid_munged(user_ns, wq->gid);
packet->pid = wq->pid;
packet->tgid = wq->tgid;
break;
@@ -175,8 +176,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
return;
}
- pipe = sbi->pipe;
- get_file(pipe);
+ pipe = get_file(sbi->pipe);
mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 1b35d6bd06b..922ad460bff 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -16,7 +16,7 @@
#include <linux/poll.h>
-static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
{
return -EIO;
}
@@ -173,13 +173,13 @@ static const struct file_operations bad_file_ops =
};
static int bad_inode_create (struct inode *dir, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, bool excl)
{
return -EIO;
}
static struct dentry *bad_inode_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd)
+ struct dentry *dentry, unsigned int flags)
{
return ERR_PTR(-EIO);
}
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index d9a40abda6b..b2664283915 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -20,8 +20,8 @@ typedef u64 befs_blocknr_t;
*/
typedef struct befs_mount_options {
- gid_t gid;
- uid_t uid;
+ kgid_t gid;
+ kuid_t uid;
int use_gid;
int use_uid;
int debug;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e18da23d42b..2b3bda8d5e6 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -15,6 +15,7 @@
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/namei.h>
+#include <linux/sched.h>
#include "befs.h"
#include "btree.h"
@@ -34,7 +35,7 @@ static int befs_readdir(struct file *, void *, filldir_t);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int befs_readpage(struct file *file, struct page *page);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
-static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int);
static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
@@ -159,7 +160,7 @@ befs_get_block(struct inode *inode, sector_t block,
}
static struct dentry *
-befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode = NULL;
struct super_block *sb = dir->i_sb;
@@ -352,9 +353,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
*/
inode->i_uid = befs_sb->mount_opts.use_uid ?
- befs_sb->mount_opts.uid : (uid_t) fs32_to_cpu(sb, raw_inode->uid);
+ befs_sb->mount_opts.uid :
+ make_kuid(&init_user_ns, fs32_to_cpu(sb, raw_inode->uid));
inode->i_gid = befs_sb->mount_opts.use_gid ?
- befs_sb->mount_opts.gid : (gid_t) fs32_to_cpu(sb, raw_inode->gid);
+ befs_sb->mount_opts.gid :
+ make_kgid(&init_user_ns, fs32_to_cpu(sb, raw_inode->gid));
set_nlink(inode, 1);
@@ -454,6 +457,11 @@ befs_init_inodecache(void)
static void
befs_destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(befs_inode_cachep);
}
@@ -674,10 +682,12 @@ parse_options(char *options, befs_mount_options * opts)
char *p;
substring_t args[MAX_OPT_ARGS];
int option;
+ kuid_t uid;
+ kgid_t gid;
/* Initialize options */
- opts->uid = 0;
- opts->gid = 0;
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
opts->use_uid = 0;
opts->use_gid = 0;
opts->iocharset = NULL;
@@ -696,23 +706,29 @@ parse_options(char *options, befs_mount_options * opts)
case Opt_uid:
if (match_int(&args[0], &option))
return 0;
- if (option < 0) {
+ uid = INVALID_UID;
+ if (option >= 0)
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid)) {
printk(KERN_ERR "BeFS: Invalid uid %d, "
"using default\n", option);
break;
}
- opts->uid = option;
+ opts->uid = uid;
opts->use_uid = 1;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return 0;
- if (option < 0) {
+ gid = INVALID_GID;
+ if (option >= 0)
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid)) {
printk(KERN_ERR "BeFS: Invalid gid %d, "
"using default\n", option);
break;
}
- opts->gid = option;
+ opts->gid = gid;
opts->use_gid = 1;
break;
case Opt_charset:
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d12c7966db2..2785ef91191 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -85,7 +85,7 @@ const struct file_operations bfs_dir_operations = {
extern void dump_imap(const char *, struct super_block *);
static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
int err;
struct inode *inode;
@@ -133,7 +133,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
}
static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct inode *inode = NULL;
struct buffer_head *bh;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062..ad3ea1497cc 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, bfs_get_block);
}
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int bfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
bfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ bfs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 9870417c26e..737aaa3f709 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -76,8 +76,8 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock);
BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
- inode->i_uid = le32_to_cpu(di->i_uid);
- inode->i_gid = le32_to_cpu(di->i_gid);
+ i_uid_write(inode, le32_to_cpu(di->i_uid));
+ i_gid_write(inode, le32_to_cpu(di->i_gid));
set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
@@ -139,8 +139,8 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_ino = cpu_to_le16(ino);
di->i_mode = cpu_to_le32(inode->i_mode);
- di->i_uid = cpu_to_le32(inode->i_uid);
- di->i_gid = cpu_to_le32(inode->i_gid);
+ di->i_uid = cpu_to_le32(i_uid_read(inode));
+ di->i_gid = cpu_to_le32(i_gid_read(inode));
di->i_nlink = cpu_to_le32(inode->i_nlink);
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -280,6 +280,11 @@ static int init_inodecache(void)
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(bfs_inode_cachep);
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index d146e181d10..6043567b95c 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,33 +30,10 @@
#include <asm/cacheflush.h>
#include <asm/a.out-core.h>
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-static int aout_core_dump(struct coredump_params *cprm);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start) {
- unsigned long addr;
- addr = vm_brk(start, end - start);
- if (BAD_ADDR(addr))
- return addr;
- }
- return 0;
-}
+#ifdef CONFIG_COREDUMP
/*
* Routine writes a core dump image in the current directory.
* Currently only a stub-function.
@@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end)
* field, which also makes sure the core-dumps won't be recursive if the
* dumping of the process results in another error..
*/
-
static int aout_core_dump(struct coredump_params *cprm)
{
struct file *file = cprm->file;
@@ -89,7 +65,7 @@ static int aout_core_dump(struct coredump_params *cprm)
current->flags |= PF_DUMPCORE;
strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->signr;
+ dump.signal = cprm->siginfo->si_signo;
aout_dump_thread(cprm->regs, &dump);
/* If the size of the dump file exceeds the rlimit, then see what would happen
@@ -135,6 +111,32 @@ end_coredump:
set_fs(fs);
return has_dumped;
}
+#else
+#define aout_core_dump NULL
+#endif
+
+static struct linux_binfmt aout_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_aout_binary,
+ .load_shlib = load_aout_library,
+ .core_dump = aout_core_dump,
+ .min_coredump = PAGE_SIZE
+};
+
+#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+ start = PAGE_ALIGN(start);
+ end = PAGE_ALIGN(end);
+ if (end > start) {
+ unsigned long addr;
+ addr = vm_brk(start, end - start);
+ if (BAD_ADDR(addr))
+ return addr;
+ }
+ return 0;
+}
/*
* create_aout_tables() parses the env- and arg-strings in new user
@@ -199,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
{
+ struct pt_regs *regs = current_pt_regs();
struct exec ex;
unsigned long error;
unsigned long fd_offset;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956afe3..0c42cdbabec 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -27,6 +27,7 @@
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
@@ -35,9 +36,15 @@
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
-#include <asm/exec.h>
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+#ifndef user_long_t
+#define user_long_t long
+#endif
+#ifndef user_siginfo_t
+#define user_siginfo_t siginfo_t
+#endif
+
+static int load_elf_binary(struct linux_binprm *bprm);
static int load_elf_library(struct file *);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
int, int, unsigned long);
@@ -551,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_addr = 0, load_bias = 0;
@@ -568,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
unsigned long def_flags = 0;
+ struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
@@ -881,7 +889,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
}
if (elf_interpreter) {
- unsigned long uninitialized_var(interp_map_addr);
+ unsigned long interp_map_addr = 0;
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
@@ -1115,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (always_dump_vma(vma))
goto whole;
- if (vma->vm_flags & VM_NODUMP)
+ if (vma->vm_flags & VM_DONTDUMP)
return 0;
/* Hugetlb memory check */
@@ -1127,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
}
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED))
+ if (vma->vm_flags & VM_IO)
return 0;
/* By default, dump shared memory if mapped from an anonymous file. */
@@ -1372,6 +1380,103 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
}
+static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
+ siginfo_t *siginfo)
+{
+ mm_segment_t old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
+ set_fs(old_fs);
+ fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+}
+
+#define MAX_FILE_NOTE_SIZE (4*1024*1024)
+/*
+ * Format of NT_FILE note:
+ *
+ * long count -- how many files are mapped
+ * long page_size -- units for file_ofs
+ * array of [COUNT] elements of
+ * long start
+ * long end
+ * long file_ofs
+ * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+ */
+static void fill_files_note(struct memelfnote *note)
+{
+ struct vm_area_struct *vma;
+ unsigned count, size, names_ofs, remaining, n;
+ user_long_t *data;
+ user_long_t *start_end_ofs;
+ char *name_base, *name_curpos;
+
+ /* *Estimated* file count and total data size needed */
+ count = current->mm->map_count;
+ size = count * 64;
+
+ names_ofs = (2 + 3 * count) * sizeof(data[0]);
+ alloc:
+ if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+ goto err;
+ size = round_up(size, PAGE_SIZE);
+ data = vmalloc(size);
+ if (!data)
+ goto err;
+
+ start_end_ofs = data + 2;
+ name_base = name_curpos = ((char *)data) + names_ofs;
+ remaining = size - names_ofs;
+ count = 0;
+ for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ struct file *file;
+ const char *filename;
+
+ file = vma->vm_file;
+ if (!file)
+ continue;
+ filename = d_path(&file->f_path, name_curpos, remaining);
+ if (IS_ERR(filename)) {
+ if (PTR_ERR(filename) == -ENAMETOOLONG) {
+ vfree(data);
+ size = size * 5 / 4;
+ goto alloc;
+ }
+ continue;
+ }
+
+ /* d_path() fills at the end, move name down */
+ /* n = strlen(filename) + 1: */
+ n = (name_curpos + remaining) - filename;
+ remaining = filename - name_curpos;
+ memmove(name_curpos, filename, n);
+ name_curpos += n;
+
+ *start_end_ofs++ = vma->vm_start;
+ *start_end_ofs++ = vma->vm_end;
+ *start_end_ofs++ = vma->vm_pgoff;
+ count++;
+ }
+
+ /* Now we know exact count of files, can store it */
+ data[0] = count;
+ data[1] = PAGE_SIZE;
+ /*
+ * Count usually is less than current->mm->map_count,
+ * we need to move filenames down.
+ */
+ n = current->mm->map_count - count;
+ if (n != 0) {
+ unsigned shift_bytes = n * 3 * sizeof(data[0]);
+ memmove(name_base - shift_bytes, name_base,
+ name_curpos - name_base);
+ name_curpos -= shift_bytes;
+ }
+
+ size = name_curpos - (char *)data;
+ fill_note(note, "CORE", NT_FILE, size, data);
+ err: ;
+}
+
#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
@@ -1385,7 +1490,10 @@ struct elf_thread_core_info {
struct elf_note_info {
struct elf_thread_core_info *thread;
struct memelfnote psinfo;
+ struct memelfnote signote;
struct memelfnote auxv;
+ struct memelfnote files;
+ user_siginfo_t csigdata;
size_t size;
int thread_notes;
};
@@ -1480,7 +1588,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1493,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
info->thread = NULL;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
- if (psinfo == NULL)
+ if (psinfo == NULL) {
+ info->psinfo.data = NULL; /* So we don't free this wrongly */
return 0;
+ }
fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
@@ -1550,7 +1660,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, signr, &info->size))
+ if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
return 0;
/*
@@ -1559,9 +1669,15 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ info->size += notesize(&info->signote);
+
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
+ fill_files_note(&info->files);
+ info->size += notesize(&info->files);
+
return 1;
}
@@ -1588,8 +1704,12 @@ static int write_note_info(struct elf_note_info *info,
if (first && !writenote(&info->psinfo, file, foffset))
return 0;
+ if (first && !writenote(&info->signote, file, foffset))
+ return 0;
if (first && !writenote(&info->auxv, file, foffset))
return 0;
+ if (first && !writenote(&info->files, file, foffset))
+ return 0;
for (i = 1; i < info->thread_notes; ++i)
if (t->notes[i].data &&
@@ -1616,6 +1736,7 @@ static void free_note_info(struct elf_note_info *info)
kfree(t);
}
kfree(info->psinfo.data);
+ vfree(info->files.data);
}
#else
@@ -1681,6 +1802,7 @@ struct elf_note_info {
#ifdef ELF_CORE_COPY_XFPREGS
elf_fpxregset_t *xfpu;
#endif
+ user_siginfo_t csigdata;
int thread_status_size;
int numnote;
};
@@ -1690,48 +1812,37 @@ static int elf_note_info_init(struct elf_note_info *info)
memset(info, 0, sizeof(*info));
INIT_LIST_HEAD(&info->thread_list);
- /* Allocate space for six ELF notes */
- info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
+ /* Allocate space for ELF notes */
+ info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL);
if (!info->notes)
return 0;
info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
if (!info->psinfo)
- goto notes_free;
+ return 0;
info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
if (!info->prstatus)
- goto psinfo_free;
+ return 0;
info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
if (!info->fpu)
- goto prstatus_free;
+ return 0;
#ifdef ELF_CORE_COPY_XFPREGS
info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
if (!info->xfpu)
- goto fpu_free;
+ return 0;
#endif
return 1;
-#ifdef ELF_CORE_COPY_XFPREGS
- fpu_free:
- kfree(info->fpu);
-#endif
- prstatus_free:
- kfree(info->prstatus);
- psinfo_free:
- kfree(info->psinfo);
- notes_free:
- kfree(info->notes);
- return 0;
}
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- long signr, struct pt_regs *regs)
+ siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
if (!elf_note_info_init(info))
return 0;
- if (signr) {
+ if (siginfo->si_signo) {
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -1749,13 +1860,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
int sz;
ets = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(signr, ets);
+ sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(info->prstatus, current, signr);
+ fill_prstatus(info->prstatus, current, siginfo->si_signo);
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
@@ -1772,9 +1883,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- info->numnote = 2;
+ fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_auxv_note(info->notes + 3, current->mm);
+ fill_files_note(info->notes + 4);
- fill_auxv_note(&info->notes[info->numnote++], current->mm);
+ info->numnote = 5;
/* Try to dump the FPU. */
info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
@@ -1836,6 +1949,9 @@ static void free_note_info(struct elf_note_info *info)
kfree(list_entry(tmp, struct elf_thread_status, list));
}
+ /* Free data allocated by fill_files_note(): */
+ vfree(info->notes[4].data);
+
kfree(info->prstatus);
kfree(info->psinfo);
kfree(info->notes);
@@ -1962,7 +2078,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
+ if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
goto cleanup;
has_dumped = 1;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3d77cf81ba3..dc84732e554 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -39,7 +39,6 @@
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/pgalloc.h>
-#include <asm/exec.h>
typedef char *elf_caddr_t;
@@ -57,7 +56,7 @@ typedef char *elf_caddr_t;
MODULE_LICENSE("GPL");
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
struct mm_struct *, const char *);
@@ -165,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
/*
* load an fdpic binary into various bits of memory
*/
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
- struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
{
struct elf_fdpic_params exec_params, interp_params;
+ struct pt_regs *regs = current_pt_regs();
struct elf_phdr *phdr;
unsigned long stack_size, entryaddr;
#ifdef ELF_FDPIC_PLAT_INIT
@@ -1205,7 +1204,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
int dump_ok;
/* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & (VM_IO | VM_RESERVED)) {
+ if (vma->vm_flags & VM_IO) {
kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
return 0;
}
@@ -1642,7 +1641,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto cleanup;
#endif
- if (cprm->signr) {
+ if (cprm->siginfo->si_signo) {
struct core_thread *ct;
struct elf_thread_status *tmp;
@@ -1661,13 +1660,13 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
int sz;
tmp = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(cprm->signr, tmp);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
thread_status_size += sz;
}
}
/* now collect the dump for the current */
- fill_prstatus(prstatus, current, cprm->signr);
+ fill_prstatus(prstatus, current, cprm->siginfo->si_signo);
elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
segs = current->mm->map_count;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912..037a3e2b045 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
#define EM86_INTERP "/usr/bin/em86"
#define EM86_I_NAME "em86"
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
{
char *interp, *i_name, *i_arg;
struct file * file;
@@ -42,7 +42,6 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
return -ENOEXEC;
}
- bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
@@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
if (retval < 0)
return retval;
- return search_binary_handler(bprm, regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 178cb70acc2..b56371981d1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
static int load_flat_shared_library(int id, struct lib_info *p);
#endif
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
static int flat_core_dump(struct coredump_params *cprm);
static struct linux_binfmt flat_format = {
@@ -107,7 +107,7 @@ static struct linux_binfmt flat_format = {
static int flat_core_dump(struct coredump_params *cprm)
{
printk("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, (int) cprm->signr);
+ current->comm, current->pid, (int) cprm->siginfo->si_signo);
return(1);
}
@@ -858,9 +858,10 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
{
struct lib_info libinfo;
+ struct pt_regs *regs = current_pt_regs();
unsigned long p = bprm->p;
unsigned long stack_len;
unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca6..0c8869fdd14 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
/*
* the loader itself
*/
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file * interp_file = NULL;
@@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (!enabled)
goto _ret;
- retval = -ENOEXEC;
- if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
- goto _ret;
-
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
fmt = check_file(bprm);
@@ -176,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto _error;
bprm->argc ++;
- bprm->interp = iname; /* for binfmt_script */
+ /* Update interp in case binfmt_script needs it. */
+ retval = bprm_change_interp(iname, bprm);
+ if (retval < 0)
+ goto _error;
interp_file = open_exec (iname);
retval = PTR_ERR (interp_file);
@@ -197,9 +196,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval < 0)
goto _error;
- bprm->recursion_depth++;
-
- retval = search_binary_handler (bprm, regs);
+ retval = search_binary_handler(bprm);
if (retval < 0)
goto _error;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f6315..5027a3e1492 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
#include <linux/err.h>
#include <linux/fs.h>
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
char *cp;
@@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
char interp[BINPRM_BUF_SIZE];
int retval;
- if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
- (bprm->recursion_depth > BINPRM_MAX_RECURSION))
+ if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
/*
* This section does the #! interpretation.
* Sorta complicated, but hopefully it will work. -TYT
*/
- bprm->recursion_depth++;
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
@@ -82,7 +80,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
retval = copy_strings_kernel(1, &i_name, bprm);
if (retval) return retval;
bprm->argc++;
- bprm->interp = interp;
+ retval = bprm_change_interp(interp, bprm);
+ if (retval < 0)
+ return retval;
/*
* OK, now restart the process with the interpreter's dentry.
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
- return search_binary_handler(bprm,regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b..4e00ed68d4a 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
#include <linux/elf.h>
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
static int load_som_library(struct file *);
/*
@@ -180,13 +180,14 @@ out:
*/
static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
{
int retval;
unsigned int size;
unsigned long som_entry;
struct som_hdr *som_ex;
struct som_exec_auxhdr *hpuxhdr;
+ struct pt_regs *regs = current_pt_regs();
/* Get the exec-header */
som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b9f61..a3f28f331b2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx)
}
/**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
* @gfp_mask: Memory allocation mask
* @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
*
* Description: This function prepares a bio for attaching integrity
* metadata. nr_vecs specifies the maximum number of pages containing
* integrity metadata that can be attached.
*/
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs,
- struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
{
struct bio_integrity_payload *bip;
unsigned int idx = vecs_to_idx(nr_vecs);
+ struct bio_set *bs = bio->bi_pool;
+
+ if (!bs)
+ bs = fs_bio_set;
BUG_ON(bio == NULL);
bip = NULL;
@@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
return bip;
}
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs)
-{
- return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
EXPORT_SYMBOL(bio_integrity_alloc);
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
*
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_set *bs = bio->bi_pool;
+
+ if (!bs)
+ bs = fs_bio_set;
BUG_ON(bip == NULL);
@@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split);
* @bio: New bio
* @bio_src: Original bio
* @gfp_mask: Memory allocation mask
- * @bs: bio_set to allocate bip from
*
* Description: Called to allocate a bip when cloning a bio
*/
int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask, struct bio_set *bs)
+ gfp_t gfp_mask)
{
struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
struct bio_integrity_payload *bip;
BUG_ON(bip_src == NULL);
- bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
+ bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
if (bip == NULL)
return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 73922abba83..b96fc6ce485 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
* IO code that does not need private memory pools.
*/
struct bio_set *fs_bio_set;
+EXPORT_SYMBOL(fs_bio_set);
/*
* Our slab pool management
@@ -73,7 +74,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
unsigned int sz = sizeof(struct bio) + extra_size;
struct kmem_cache *slab = NULL;
- struct bio_slab *bslab;
+ struct bio_slab *bslab, *new_bio_slabs;
+ unsigned int new_bio_slab_max;
unsigned int i, entry = -1;
mutex_lock(&bio_slab_lock);
@@ -96,12 +98,14 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
goto out_unlock;
if (bio_slab_nr == bio_slab_max && entry == -1) {
- bio_slab_max <<= 1;
- bio_slabs = krealloc(bio_slabs,
- bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!bio_slabs)
+ new_bio_slab_max = bio_slab_max << 1;
+ new_bio_slabs = krealloc(bio_slabs,
+ new_bio_slab_max * sizeof(struct bio_slab),
+ GFP_KERNEL);
+ if (!new_bio_slabs)
goto out_unlock;
+ bio_slab_max = new_bio_slab_max;
+ bio_slabs = new_bio_slabs;
}
if (entry == -1)
entry = bio_slab_nr++;
@@ -232,26 +236,37 @@ fallback:
return bvl;
}
-void bio_free(struct bio *bio, struct bio_set *bs)
+static void __bio_free(struct bio *bio)
{
+ bio_disassociate_task(bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_free(bio);
+}
+
+static void bio_free(struct bio *bio)
+{
+ struct bio_set *bs = bio->bi_pool;
void *p;
- if (bio_has_allocated_vec(bio))
- bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+ __bio_free(bio);
- if (bio_integrity(bio))
- bio_integrity_free(bio, bs);
+ if (bs) {
+ if (bio_has_allocated_vec(bio))
+ bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- if (bs->front_pad)
+ /*
+ * If we have front padding, adjust the bio pointer before freeing
+ */
+ p = bio;
p -= bs->front_pad;
- mempool_free(p, bs->bio_pool);
+ mempool_free(p, bs->bio_pool);
+ } else {
+ /* Bio was allocated by bio_kmalloc() */
+ kfree(bio);
+ }
}
-EXPORT_SYMBOL(bio_free);
void bio_init(struct bio *bio)
{
@@ -262,48 +277,85 @@ void bio_init(struct bio *bio)
EXPORT_SYMBOL(bio_init);
/**
+ * bio_reset - reinitialize a bio
+ * @bio: bio to reset
+ *
+ * Description:
+ * After calling bio_reset(), @bio will be in the same state as a freshly
+ * allocated bio returned by bio_alloc_bioset() - the only fields that are
+ * preserved are the ones that are initialized by bio_alloc_bioset(). See
+ * comment in struct bio.
+ */
+void bio_reset(struct bio *bio)
+{
+ unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+ __bio_free(bio);
+
+ memset(bio, 0, BIO_RESET_BYTES);
+ bio->bi_flags = flags|(1 << BIO_UPTODATE);
+}
+EXPORT_SYMBOL(bio_reset);
+
+/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Description:
- * bio_alloc_bioset will try its own mempool to satisfy the allocation.
- * If %__GFP_WAIT is set then we will block on the internal pool waiting
- * for a &struct bio to become free.
+ * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
+ * backed by the @bs's mempool.
*
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
- **/
+ * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
+ * able to allocate a bio. This is due to the mempool guarantees. To make this
+ * work, callers must never allocate more than 1 bio at a time from this pool.
+ * Callers that need to allocate more than 1 bio must always submit the
+ * previously allocated bio for IO before attempting to allocate a new one.
+ * Failure to do so can cause deadlocks under memory pressure.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
+ unsigned front_pad;
+ unsigned inline_vecs;
unsigned long idx = BIO_POOL_NONE;
struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
+ if (!bs) {
+ if (nr_iovecs > UIO_MAXIOV)
+ return NULL;
+
+ p = kmalloc(sizeof(struct bio) +
+ nr_iovecs * sizeof(struct bio_vec),
+ gfp_mask);
+ front_pad = 0;
+ inline_vecs = nr_iovecs;
+ } else {
+ p = mempool_alloc(bs->bio_pool, gfp_mask);
+ front_pad = bs->front_pad;
+ inline_vecs = BIO_INLINE_VECS;
+ }
+
if (unlikely(!p))
return NULL;
- bio = p + bs->front_pad;
+ bio = p + front_pad;
bio_init(bio);
- if (unlikely(!nr_iovecs))
- goto out_set;
-
- if (nr_iovecs <= BIO_INLINE_VECS) {
- bvl = bio->bi_inline_vecs;
- nr_iovecs = BIO_INLINE_VECS;
- } else {
+ if (nr_iovecs > inline_vecs) {
bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
if (unlikely(!bvl))
goto err_free;
-
- nr_iovecs = bvec_nr_vecs(idx);
+ } else if (nr_iovecs) {
+ bvl = bio->bi_inline_vecs;
}
-out_set:
+
+ bio->bi_pool = bs;
bio->bi_flags |= idx << BIO_POOL_OFFSET;
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bvl;
@@ -315,80 +367,6 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
-static void bio_fs_destructor(struct bio *bio)
-{
- bio_free(bio, fs_bio_set);
-}
-
-/**
- * bio_alloc - allocate a new bio, memory pool backed
- * @gfp_mask: allocation mask to use
- * @nr_iovecs: number of iovecs
- *
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
- if (bio)
- bio->bi_destructor = bio_fs_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_alloc);
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
- if (bio_integrity(bio))
- bio_integrity_free(bio, fs_bio_set);
- kfree(bio);
-}
-
-/**
- * bio_kmalloc - allocate a bio for I/O using kmalloc()
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- *
- * Description:
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
- * %__GFP_WAIT, the allocation is guaranteed to succeed.
- *
- **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- if (unlikely(!bio))
- return NULL;
-
- bio_init(bio);
- bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bio->bi_destructor = bio_kmalloc_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
void zero_fill_bio(struct bio *bio)
{
unsigned long flags;
@@ -419,11 +397,8 @@ void bio_put(struct bio *bio)
/*
* last put frees it
*/
- if (atomic_dec_and_test(&bio->bi_cnt)) {
- bio_disassociate_task(bio);
- bio->bi_next = NULL;
- bio->bi_destructor(bio);
- }
+ if (atomic_dec_and_test(&bio->bi_cnt))
+ bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
@@ -465,26 +440,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
EXPORT_SYMBOL(__bio_clone);
/**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
* @bio: bio to clone
* @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
*
* Like __bio_clone, only also allocates the returned bio
*/
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+ struct bio_set *bs)
{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+ struct bio *b;
+ b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
if (!b)
return NULL;
- b->bi_destructor = bio_fs_destructor;
__bio_clone(b, bio);
if (bio_integrity(bio)) {
int ret;
- ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+ ret = bio_integrity_clone(b, bio, gfp_mask);
if (ret < 0) {
bio_put(b);
@@ -494,7 +471,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
-EXPORT_SYMBOL(bio_clone);
+EXPORT_SYMBOL(bio_clone_bioset);
/**
* bio_get_nr_vecs - return approx number of vecs
@@ -1312,7 +1289,7 @@ EXPORT_SYMBOL(bio_copy_kern);
* Note that this code is very hard to test under normal circumstances because
* direct-io pins the pages with get_user_pages(). This makes
* is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, pdflush) could clean the pages if they are mapped
+ * But other code (eg, flusher threads) could clean the pages if they are mapped
* pagecache.
*
* Simply disabling the call to bio_set_pages_dirty() is a good way to test the
@@ -1500,7 +1477,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
bi->bi_sector + first_sectors);
- BUG_ON(bi->bi_vcnt != 1);
+ BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
BUG_ON(bi->bi_idx != 0);
atomic_set(&bp->cnt, 3);
bp->error = 0;
@@ -1510,17 +1487,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
bp->bio2.bi_size -= first_sectors << 9;
bp->bio1.bi_size = first_sectors << 9;
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
+ if (bi->bi_vcnt != 0) {
+ bp->bv1 = bi->bi_io_vec[0];
+ bp->bv2 = bi->bi_io_vec[0];
+
+ if (bio_is_rw(bi)) {
+ bp->bv2.bv_offset += first_sectors << 9;
+ bp->bv2.bv_len -= first_sectors << 9;
+ bp->bv1.bv_len = first_sectors << 9;
+ }
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
+ bp->bio1.bi_io_vec = &bp->bv1;
+ bp->bio2.bi_io_vec = &bp->bv2;
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
+ bp->bio1.bi_max_vecs = 1;
+ bp->bio2.bi_max_vecs = 1;
+ }
bp->bio1.bi_end_io = bio_pair_end_1;
bp->bio2.bi_end_io = bio_pair_end_2;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c2bbe1fb132..172f8491a2b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
spin_unlock(&dst->wb.list_lock);
}
-sector_t blkdev_max_block(struct block_device *bdev)
-{
- sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
-
- if (sz) {
- unsigned int size = block_size(bdev);
- unsigned int sizebits = blksize_bits(size);
- retval = (sz >> sizebits);
- }
- return retval;
-}
-
/* Kill _all_ buffers and pagecache , dirty or not.. */
void kill_bdev(struct block_device *bdev)
{
@@ -163,52 +150,12 @@ static int
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- if (iblock >= blkdev_max_block(I_BDEV(inode))) {
- if (create)
- return -EIO;
-
- /*
- * for reads, we're just trying to fill a partial page.
- * return a hole, they will have to call get_block again
- * before they can fill it, and they will get -EIO at that
- * time
- */
- return 0;
- }
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- sector_t end_block = blkdev_max_block(I_BDEV(inode));
- unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
- if ((iblock + max_blocks) > end_block) {
- max_blocks = end_block - iblock;
- if ((long)max_blocks <= 0) {
- if (create)
- return -EIO; /* write fully beyond EOF */
- /*
- * It is a read which is fully beyond EOF. We return
- * a !buffer_mapped buffer
- */
- max_blocks = 0;
- }
- }
-
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- bh->b_size = max_blocks << inode->i_blkbits;
- if (max_blocks)
- set_buffer_mapped(bh);
- return 0;
-}
-
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct inode *inode = file->f_mapping->host;
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+ nr_segs, blkdev_get_block, NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -374,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
* for a block special file file->f_path.dentry->d_inode->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
loff_t size;
@@ -384,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
size = i_size_read(bd_inode);
retval = -EINVAL;
- switch (origin) {
+ switch (whence) {
case SEEK_END:
offset += size;
break;
@@ -1578,10 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
@@ -1590,10 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
+/*
+ * aio read entry point for block device files.
+ *
+ * A read that starts at or past i_size of the block device inode returns 0.
+ * Otherwise, when the distance from @pos to i_size is below INT_MAX, the
+ * iovec is passed through iov_shorten() with that distance before the
+ * request is handed to generic_file_aio_read().
+ */
+static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	struct inode *bd_inode = iocb->ki_filp->f_mapping->host;
+	loff_t dev_size = i_size_read(bd_inode);
+	loff_t remaining;
+
+	/* nothing to read at or beyond the end */
+	if (dev_size <= pos)
+		return 0;
+
+	remaining = dev_size - pos;
+	if (remaining < INT_MAX) {
+		/* presumably trims the iovec to 'remaining' bytes */
+		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, remaining);
+	}
+
+	return generic_file_aio_read(iocb, iov, nr_segs, pos);
+}
+
/*
* Try to release a page associated with block device when the system
* is under memory pressure.
@@ -1624,7 +1590,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .aio_read = blkdev_aio_read,
.aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
@@ -1710,3 +1676,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
return res;
}
EXPORT_SYMBOL(__invalidate_device);
+
+/*
+ * Call @func(bdev, @arg) for every block device inode on
+ * blockdev_superblock->s_inodes whose mapping currently holds pages.
+ *
+ * Inodes in state I_FREEING, I_WILL_FREE or I_NEW, and inodes whose mapping
+ * has nrpages == 0, are skipped. @func runs with neither inode_sb_list_lock
+ * nor inode->i_lock held, while a reference taken with __iget() is kept on
+ * the inode. That reference is dropped only after the walk has moved on to
+ * the next inode (or after the loop ends).
+ */
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+ struct inode *inode, *old_inode = NULL;
+
+ spin_lock(&inode_sb_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ struct address_space *mapping = inode->i_mapping;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ mapping->nrpages == 0) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_sb_list_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ func(I_BDEV(inode), arg);
+
+ spin_lock(&inode_sb_list_lock);
+ }
+ spin_unlock(&inode_sb_list_lock);
+ iput(old_inode);
+}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4fa2befae..7df3e0f0ee5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 761e2cd8fed..e15d2b0d8d3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -61,7 +61,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, value, size);
}
if (size > 0) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
} else if (size == -ENOENT || size == -ENODATA || size == 0) {
/* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
@@ -91,7 +91,7 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
return PTR_ERR(acl);
if (acl == NULL)
return -ENODATA;
- ret = posix_acl_to_xattr(acl, value, size);
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
posix_acl_release(acl);
return ret;
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
return ret;
+ if (ret == 0)
+ acl = NULL;
}
ret = 0;
break;
@@ -141,7 +143,7 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = posix_acl_to_xattr(acl, value, size);
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (ret < 0)
goto out;
}
@@ -169,7 +171,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
return -EOPNOTSUPP;
if (value) {
- acl = posix_acl_from_xattr(value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 42704149b72..58b7d14b08e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -206,10 +206,17 @@ static noinline void run_ordered_completions(struct btrfs_workers *workers,
work->ordered_func(work);
- /* now take the lock again and call the freeing code */
+ /* now take the lock again and drop our item from the list */
spin_lock(&workers->order_lock);
list_del(&work->order_list);
+ spin_unlock(&workers->order_lock);
+
+ /*
+ * we don't want to call the ordered free functions
+ * with the lock held though
+ */
work->ordered_free(work);
+ spin_lock(&workers->order_lock);
}
spin_unlock(&workers->order_lock);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a383c18e74e..04edf69be87 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
@@ -231,7 +232,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (!ret) {
ret = ulist_add(parents, eb->start,
- (unsigned long)eie, GFP_NOFS);
+ (uintptr_t)eie, GFP_NOFS);
if (ret < 0)
break;
if (!extent_item_pos) {
@@ -282,9 +283,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- rcu_read_lock();
- root_level = btrfs_header_level(root->node);
- rcu_read_unlock();
+ root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level)
goto out;
@@ -363,8 +362,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
node = ulist_next(parents, &uiter);
ref->parent = node ? node->val : 0;
- ref->inode_list =
- node ? (struct extent_inode_elem *)node->aux : 0;
+ ref->inode_list = node ?
+ (struct extent_inode_elem *)(uintptr_t)node->aux : 0;
/* additional parents require new refs being added here */
while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +374,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
- new_ref->inode_list =
- (struct extent_inode_elem *)node->aux;
+ new_ref->inode_list = (struct extent_inode_elem *)
+ (uintptr_t)node->aux;
list_add(&new_ref->list, &ref->list);
}
ulist_reinit(parents);
@@ -462,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
pos2 = n2, n2 = pos2->next) {
struct __prelim_ref *ref2;
struct __prelim_ref *xchg;
+ struct extent_inode_elem *eie;
ref2 = list_entry(pos2, struct __prelim_ref, list);
@@ -473,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
ref1 = ref2;
ref2 = xchg;
}
- ref1->count += ref2->count;
} else {
if (ref1->parent != ref2->parent)
continue;
- ref1->count += ref2->count;
}
+
+ eie = ref1->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
+ if (eie)
+ eie->next = ref2->inode_list;
+ else
+ ref1->inode_list = ref2->inode_list;
+ ref1->count += ref2->count;
+
list_del(&ref2->list);
kfree(ref2);
}
@@ -773,9 +781,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist *refs, struct ulist *roots,
- const u64 *extent_item_pos)
+ u64 time_seq, struct ulist *refs,
+ struct ulist *roots, const u64 *extent_item_pos)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -837,7 +844,7 @@ again:
btrfs_put_delayed_ref(&head->node);
goto again;
}
- ret = __add_delayed_refs(head, delayed_ref_seq,
+ ret = __add_delayed_refs(head, time_seq,
&prefs_delayed);
mutex_unlock(&head->mutex);
if (ret) {
@@ -892,8 +899,7 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
list_del(&ref->list);
- if (ref->count < 0)
- WARN_ON(1);
+ WARN_ON(ref->count < 0);
if (ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
@@ -915,8 +921,8 @@ again:
free_extent_buffer(eb);
}
ret = ulist_add_merge(refs, ref->parent,
- (unsigned long)ref->inode_list,
- (unsigned long *)&eie, GFP_NOFS);
+ (uintptr_t)ref->inode_list,
+ (u64 *)&eie, GFP_NOFS);
if (!ret && extent_item_pos) {
/*
* we've recorded that parent, so we must extend
@@ -960,7 +966,7 @@ static void free_leaf_list(struct ulist *blocks)
while ((node = ulist_next(blocks, &uiter))) {
if (!node->aux)
continue;
- eie = (struct extent_inode_elem *)node->aux;
+ eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
for (; eie; eie = eie_next) {
eie_next = eie->next;
kfree(eie);
@@ -981,8 +987,7 @@ static void free_leaf_list(struct ulist *blocks)
*/
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **leafs,
+ u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
struct ulist *tmp;
@@ -997,7 +1002,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ ret = find_parent_nodes(trans, fs_info, bytenr,
time_seq, *leafs, tmp, extent_item_pos);
ulist_free(tmp);
@@ -1024,8 +1029,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*/
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **roots)
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1043,7 +1047,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ ret = find_parent_nodes(trans, fs_info, bytenr,
time_seq, tmp, *roots, NULL);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
@@ -1111,44 +1115,97 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
found_key);
}
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
- struct btrfs_inode_ref *iref,
- struct extent_buffer *eb_in, u64 parent,
- char *dest, u32 size)
+/*
+ * Find the first BTRFS_INODE_EXTREF_KEY item of inode @inode_objectid whose
+ * key offset is at or after @start_off.
+ *
+ * On success 0 is returned, *@ret_extref is set to the item's
+ * btrfs_item_ptr_offset() value in the leaf at path->nodes[0] (cast to a
+ * struct btrfs_inode_extref pointer) and, if @found_off is not NULL,
+ * *@found_off receives the key offset of the item found.
+ *
+ * Returns -ENOENT when no such item exists, or the negative error returned
+ * by btrfs_search_slot() / btrfs_next_leaf().
+ */
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	unsigned long item_off;
+	int slot;
+	int ret;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	for (;;) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * When the exact key is absent, btrfs_search_slot
+			 * leaves us at the slot where it would be inserted.
+			 * If that is one past the last item of this leaf,
+			 * the next candidate lives in the following leaf.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (!ret)
+				continue;
+			/* a positive return is reported as -ENOENT */
+			return ret > 0 ? -ENOENT : ret;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Anything other than an extended ref key of this very
+		 * objectid means there is nothing more to find.
+		 */
+		if (found_key.objectid != inode_objectid ||
+		    btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			return -ENOENT;
+
+		item_off = btrfs_item_ptr_offset(leaf, slot);
+		*ret_extref = (struct btrfs_inode_extref *)item_off;
+		if (found_off)
+			*found_off = found_key.offset;
+		return 0;
+	}
+}
+
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
{
- u32 len;
int slot;
u64 next_inum;
int ret;
- s64 bytes_left = size - 1;
+ s64 bytes_left = ((s64)size) - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
int leave_spinning = path->leave_spinning;
+ struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
path->leave_spinning = 1;
while (1) {
- len = btrfs_inode_ref_name_len(eb, iref);
- bytes_left -= len;
+ bytes_left -= name_len;
if (bytes_left >= 0)
read_extent_buffer(eb, dest + bytes_left,
- (unsigned long)(iref + 1), len);
+ name_off, name_len);
if (eb != eb_in) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1158,6 +1215,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
ret = -ENOENT;
if (ret)
break;
+
next_inum = found_key.offset;
/* regular exit ahead */
@@ -1173,8 +1231,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
}
btrfs_release_path(path);
-
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
parent = next_inum;
--bytes_left;
if (bytes_left >= 0)
@@ -1191,12 +1252,39 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	/* the name is addressed as the bytes directly following the ref */
+	unsigned long name_off = (unsigned long)(iref + 1);
+	u32 name_len = btrfs_inode_ref_name_len(eb_in, iref);
+
+	return btrfs_ref_to_path(fs_root, path, name_len, name_off,
+				 eb_in, parent, dest, size);
+}
+
+/*
* this makes the path point to (logical EXTENT_ITEM *)
* returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
* tree blocks and <0 on error.
*/
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key)
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
{
int ret;
u64 flags;
@@ -1240,10 +1328,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
(unsigned long long)found_key->objectid,
(unsigned long long)found_key->offset,
(unsigned long long)flags, item_size);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- return BTRFS_EXTENT_FLAG_TREE_BLOCK;
- if (flags & BTRFS_EXTENT_FLAG_DATA)
- return BTRFS_EXTENT_FLAG_DATA;
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG_ON(1);
+ return 0;
+ }
return -EIO;
}
@@ -1376,11 +1471,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list seq_elem = {};
struct seq_list tree_mod_seq_elem = {};
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
- struct btrfs_delayed_ref_root *delayed_refs = NULL;
pr_debug("resolving all inodes for extent %llu\n",
extent_item_objectid);
@@ -1391,16 +1484,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
- btrfs_get_delayed_seq(delayed_refs, &seq_elem);
- spin_unlock(&delayed_refs->lock);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+ tree_mod_seq_elem.seq, &refs,
&extent_item_pos);
if (ret)
goto out;
@@ -1408,19 +1496,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- seq_elem.seq,
- tree_mod_seq_elem.seq, &roots);
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
pr_debug("root %llu references leaf %llu, data list "
- "%#lx\n", root_node->val, ref_node->val,
- ref_node->aux);
- ret = iterate_leaf_refs(
- (struct extent_inode_elem *)ref_node->aux,
- root_node->val, extent_item_objectid,
- iterate, ctx);
+ "%#llx\n", root_node->val, ref_node->val,
+ (long long)ref_node->aux);
+ ret = iterate_leaf_refs((struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
}
ulist_free(roots);
roots = NULL;
@@ -1431,7 +1519,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- btrfs_put_delayed_seq(delayed_refs, &seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
}
@@ -1444,16 +1531,16 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
{
int ret;
u64 extent_item_pos;
+ u64 flags = 0;
struct btrfs_key found_key;
int search_commit_root = path->search_commit_root;
- ret = extent_from_logical(fs_info, logical, path,
- &found_key);
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
btrfs_release_path(path);
- if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- ret = -EINVAL;
if (ret < 0)
return ret;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ return -EINVAL;
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
@@ -1463,9 +1550,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
{
int ret = 0;
int slot;
@@ -1482,7 +1572,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
while (!ret) {
path->leave_spinning = 1;
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ &found_key);
if (ret < 0)
break;
if (ret) {
@@ -1510,7 +1600,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
"tree %llu\n", cur,
(unsigned long long)found_key.objectid,
(unsigned long long)fs_root->objectid);
- ret = iterate(parent, iref, eb, ctx);
+ ret = iterate(parent, name_len,
+ (unsigned long)(iref + 1), eb, ctx);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -1525,12 +1616,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
+/*
+ * Walk the extended inode refs (BTRFS_INODE_EXTREF_KEY items) of inode
+ * @inum in @fs_root and call @iterate once for every name packed into each
+ * item, passing the parent, the name length, the name's location inside the
+ * extent buffer, the extent buffer itself and @ctx.
+ *
+ * @path is used for the lookups and is released before returning.
+ *
+ * Returns the first non-zero value returned by @iterate; otherwise the
+ * value that ended the lookup loop, i.e. the negative error from
+ * btrfs_find_one_extref() (-ENOENT once no further extref item exists).
+ */
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		/*
+		 * The path must not be consulted once it has been released;
+		 * work from the pinned, locked eb and the slot saved above.
+		 */
+		item_size = btrfs_item_size_nr(eb, slot);
+		ptr = btrfs_item_ptr_offset(eb, slot);
+		cur_offset = 0;
+
+		/* one item may hold several extrefs packed back to back */
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += name_len;
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		/*
+		 * A failing callback ends the walk; without this the next
+		 * lookup would overwrite its return value.
+		 */
+		if (ret)
+			break;
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+/*
+ * Run @iterate over the regular inode refs of @inum and then over its
+ * extended refs. -ENOENT from the regular-ref pass is tolerated; any other
+ * non-zero result of that pass is returned at once. -ENOENT from the
+ * extended-ref pass is turned into 0 when the regular-ref pass succeeded.
+ */
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int have_refs;
+	int err;
+
+	err = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (err && err != -ENOENT)
+		return err;
+	have_refs = (err == 0);
+
+	err = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (have_refs && err == -ENOENT)
+		return 0;
+
+	return err;
+}
+
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx)
{
struct inode_fs_paths *ipath = ctx;
char *fspath;
@@ -1543,20 +1720,16 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
ipath->fspath->bytes_left - s_ptr : 0;
fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
- fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
- inum, fspath_min, bytes_left);
+ fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min, bytes_left);
if (IS_ERR(fspath))
return PTR_ERR(fspath);
if (fspath > fspath_min) {
- pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
- pr_debug("missed path, not enough space. missing bytes: %lu, "
- "constructed so far: %s\n",
- (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
@@ -1578,7 +1751,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ inode_to_path, ipath);
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -1587,7 +1760,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kmalloc(alloc_bytes, GFP_NOFS);
+ data = vmalloc(alloc_bytes);
if (!data)
return ERR_PTR(-ENOMEM);
@@ -1638,6 +1811,6 @@ void free_ipath(struct inode_fs_paths *ipath)
{
if (!ipath)
return;
- kfree(ipath->fspath);
+ vfree(ipath->fspath);
kfree(ipath);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index c18d8ac7b79..d61feca7945 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -21,6 +21,7 @@
#include "ioctl.h"
#include "ulist.h"
+#include "extent_io.h"
#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
@@ -32,14 +33,13 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
- struct extent_buffer *eb, void *ctx);
int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
- struct btrfs_path *path, struct btrfs_key *found_key);
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_extent_item *ei, u32 item_size,
@@ -58,12 +58,23 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 delayed_ref_seq, u64 time_seq,
- struct ulist **roots);
+ u64 time_seq, struct ulist **roots);
+char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ struct btrfs_inode_ref *iref, struct extent_buffer *eb,
+ u64 parent, char *dest, u32 size);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 12394a90d60..2a8c242bc4f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,8 @@
#define BTRFS_INODE_DELALLOC_META_RESERVED 4
#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+#define BTRFS_INODE_NEEDS_FULL_SYNC 7
+#define BTRFS_INODE_COPY_EVERYTHING 8
/* in memory btrfs inode */
struct btrfs_inode {
@@ -87,11 +89,11 @@ struct btrfs_inode {
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
- /* the space_info for where this inode's data allocations are done */
- struct btrfs_space_info *space_info;
-
unsigned long runtime_flags;
+ /* Keep track of who's O_SYNC/fsyncing currently */
+ atomic_t sync_writers;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
@@ -146,6 +148,9 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
+ /* a local copy of root's last_log_commit */
+ unsigned long last_log_commit;
+
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@@ -191,26 +196,24 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
BTRFS_I(inode)->disk_i_size = size;
}
-static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
- struct inode *inode)
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
{
- if (root == root->fs_info->tree_root ||
- BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == root->fs_info->tree_root &&
+ btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+ return true;
+ if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
return true;
return false;
}
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
if (BTRFS_I(inode)->logged_trans == generation &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
+ BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+ return 1;
+ return 0;
}
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index da6e9364a5e..11d47bfb62b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
* the file system was mounted, (i.e., they have been
* referenced by the super block) or they have been
* written since then and the write completion callback
- * was called and a FLUSH request to the device where
- * these blocks are located was received and completed.
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
* 2b. All referenced blocks need to have a generation
* number which is equal to the parent's number.
*
@@ -136,7 +137,7 @@ struct btrfsic_block {
unsigned int never_written:1; /* block was added because it was
* referenced, not because it was
* written */
- unsigned int mirror_num:2; /* large enough to hold
+ unsigned int mirror_num; /* large enough to hold
* BTRFS_SUPER_MIRROR_MAX */
struct btrfsic_dev_state *dev_state;
u64 dev_bytenr; /* key, physical byte num on disk */
@@ -722,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -902,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1032,6 +1033,7 @@ continue_with_current_leaf_stack_frame:
struct btrfs_disk_key *disk_key;
u8 type;
u32 item_offset;
+ u32 item_size;
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
@@ -1047,6 +1049,7 @@ leaf_item_out_of_bounce_error:
disk_item_offset,
sizeof(struct btrfs_item));
item_offset = le32_to_cpu(disk_item.offset);
+ item_size = le32_to_cpu(disk_item.size);
disk_key = &disk_item.key;
type = disk_key->type;
@@ -1057,14 +1060,13 @@ leaf_item_out_of_bounce_error:
root_item_offset = item_offset +
offsetof(struct btrfs_leaf, items);
- if (root_item_offset +
- sizeof(struct btrfs_root_item) >
+ if (root_item_offset + item_size >
sf->block_ctx->len)
goto leaf_item_out_of_bounce_error;
btrfsic_read_from_block_data(
sf->block_ctx, &root_item,
root_item_offset,
- sizeof(struct btrfs_root_item));
+ item_size);
next_bytenr = le64_to_cpu(root_item.bytenr);
sf->error =
@@ -1285,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
*next_blockp = NULL;
if (0 == *num_copiesp) {
*num_copiesp =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1487,7 +1489,7 @@ static int btrfsic_handle_extent_data(
chunk_len = num_bytes;
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1580,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_device *device;
length = len;
- ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ ret = btrfs_map_block(state->root->fs_info, READ,
bytenr, &length, &multi, mirror_num);
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
device = multi->stripes[0].dev;
block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1592,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
block_ctx_out->pagev = NULL;
block_ctx_out->mem_to_free = NULL;
- if (0 == ret)
- kfree(multi);
+ kfree(multi);
if (NULL == block_ctx_out->dev) {
ret = -ENXIO;
printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2461,7 +2474,7 @@ static int btrfsic_process_written_superblock(
}
num_copies =
- btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ btrfs_num_copies(state->root->fs_info,
next_bytenr, BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2600,6 +2613,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
(unsigned long long)l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
} else if (l->parent_generation !=
l->block_ref_to->generation &&
BTRFSIC_GENERATION_UNKNOWN !=
@@ -2947,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
struct btrfsic_block_data_ctx block_ctx;
int match = 0;
- num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(state->root->fs_info,
bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 86eff48dab7..94ab2f80e7e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
int ret = -ENOMEM;
+ int faili = 0;
u32 *sums;
tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
- if (!cb->compressed_pages[pg_index])
+ if (!cb->compressed_pages[pg_index]) {
+ faili = pg_index - 1;
+ ret = -ENOMEM;
goto fail2;
+ }
}
+ faili = nr_pages - 1;
cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -682,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
@@ -707,14 +713,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ bio_endio(comp_bio, ret);
bio_put(comp_bio);
return 0;
fail2:
- for (pg_index = 0; pg_index < nr_pages; pg_index++)
- free_page((unsigned long)cb->compressed_pages[pg_index]);
+ while (faili >= 0) {
+ __free_page(cb->compressed_pages[faili]);
+ faili--;
+ }
kfree(cb->compressed_pages);
fail1:
@@ -818,6 +827,7 @@ static void free_workspace(int type, struct list_head *workspace)
btrfs_compress_op[idx]->free_workspace(workspace);
atomic_dec(alloc_workspace);
wake:
+ smp_mb();
if (waitqueue_active(workspace_wait))
wake_up(workspace_wait);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8206b390058..eea5da7a2b9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log);
+ struct btrfs_path *path, int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -321,7 +320,7 @@ struct tree_mod_root {
struct tree_mod_elem {
struct rb_node node;
u64 index; /* shifted logical */
- struct seq_list elem;
+ u64 seq;
enum mod_log_op op;
/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
@@ -341,20 +340,50 @@ struct tree_mod_elem {
struct tree_mod_root old_root;
};
-static inline void
-__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
- elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ read_lock(&fs_info->tree_mod_log_lock);
}
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+ read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
{
- elem->flags = 1;
+ u64 seq;
+
+ tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
- __get_tree_mod_seq(fs_info, elem);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ }
+ seq = btrfs_inc_tree_mod_seq(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
+
+ return seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -371,41 +400,40 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
if (!seq_putting)
return;
- BUG_ON(!(elem->flags & 1));
spin_lock(&fs_info->tree_mod_seq_lock);
list_del(&elem->list);
+ elem->seq = 0;
list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+ if (cur_elem->seq < min_seq) {
if (seq_putting > cur_elem->seq) {
/*
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
- goto out;
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return;
}
min_seq = cur_elem->seq;
}
}
+ spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- write_lock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_write_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = container_of(node, struct tree_mod_elem, node);
- if (tm->elem.seq > min_seq)
+ if (tm->seq > min_seq)
continue;
rb_erase(node, tm_root);
- list_del(&tm->elem.list);
kfree(tm);
}
- write_unlock(&fs_info->tree_mod_log_lock);
-out:
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
}
/*
@@ -423,11 +451,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
- int ret = 0;
- BUG_ON(!tm || !tm->elem.seq);
+ BUG_ON(!tm || !tm->seq);
- write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
while (*new) {
@@ -437,88 +463,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
new = &((*new)->rb_left);
else if (cur->index > tm->index)
new = &((*new)->rb_right);
- else if (cur->elem.seq < tm->elem.seq)
+ else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
- else if (cur->elem.seq > tm->elem.seq)
+ else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
else {
kfree(tm);
- ret = -EEXIST;
- goto unlock;
+ return -EEXIST;
}
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
-unlock:
- write_unlock(&fs_info->tree_mod_log_lock);
- return ret;
+ return 0;
}
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
smp_mb();
if (list_empty(&(fs_info)->tree_mod_seq_list))
return 1;
- if (!eb)
- return 0;
- if (btrfs_header_level(eb) == 0)
+ if (eb && btrfs_header_level(eb) == 0)
+ return 1;
+
+ tree_mod_log_write_lock(fs_info);
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ /*
+ * someone emptied the list while we were waiting for the lock.
+ * we must not add to the list when no blocker exists.
+ */
+ tree_mod_log_write_unlock(fs_info);
return 1;
+ }
+
return 0;
}
/*
- * This allocates memory and gets a tree modification sequence number when
- * needed.
+ * This allocates memory and gets a tree modification sequence number.
*
- * Returns 0 when no sequence number is needed, < 0 on error.
- * Returns 1 when a sequence number was added. In this case,
- * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
- * after inserting into the rb tree.
+ * Returns <0 on error.
+ * Returns >0 (the added sequence number) on success.
*/
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
struct tree_mod_elem **tm_ret)
{
struct tree_mod_elem *tm;
- int seq;
-
- if (tree_mod_dont_log(fs_info, NULL))
- return 0;
- tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+ /*
+ * once we switch from spin locks to something different, we should
+ * honor the flags parameter here.
+ */
+ tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
return -ENOMEM;
- tm->elem.flags = 0;
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (list_empty(&fs_info->tree_mod_seq_list)) {
- /*
- * someone emptied the list while we were waiting for the lock.
- * we must not add to the list, because no blocker exists. items
- * are removed from the list only when the existing blocker is
- * removed from the list.
- */
- kfree(tm);
- seq = 0;
- spin_unlock(&fs_info->tree_mod_seq_lock);
- } else {
- __get_tree_mod_seq(fs_info, &tm->elem);
- seq = tm->elem.seq;
- }
-
- return seq;
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+ return tm->seq;
}
-static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
{
- struct tree_mod_elem *tm;
int ret;
+ struct tree_mod_elem *tm;
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
+ if (ret < 0)
return ret;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
@@ -530,8 +549,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
- ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ int ret;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return 0;
+
+ ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
@@ -543,6 +576,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
}
static noinline int
+tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op)
+{
+ return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
int nr_items, gfp_t flags)
@@ -554,15 +595,20 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (tree_mod_dont_log(fs_info, eb))
return 0;
+ /*
+ * When we overwrite something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+ ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
MOD_LOG_KEY_REMOVE_WHILE_MOVING);
BUG_ON(ret < 0);
}
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
- return ret;
+ if (ret < 0)
+ goto out;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
tm->slot = src_slot;
@@ -571,10 +617,29 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
tm->op = MOD_LOG_MOVE_KEYS;
ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+ int i;
+ u32 nritems;
+ int ret;
+
+ if (btrfs_header_level(eb) == 0)
+ return;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+ BUG_ON(ret < 0);
+ }
+}
+
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
@@ -583,9 +648,12 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct tree_mod_elem *tm;
int ret;
+ if (tree_mod_dont_log(fs_info, NULL))
+ return 0;
+
ret = tree_mod_alloc(fs_info, flags, &tm);
- if (ret <= 0)
- return ret;
+ if (ret < 0)
+ goto out;
tm->index = new_root->start >> PAGE_CACHE_SHIFT;
tm->old_root.logical = old_root->start;
@@ -594,7 +662,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
tm->op = MOD_LOG_ROOT_REPLACE;
ret = __tree_mod_log_insert(fs_info, tm);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+out:
+ tree_mod_log_write_unlock(fs_info);
return ret;
}
@@ -608,7 +677,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct tree_mod_elem *found = NULL;
u64 index = start >> PAGE_CACHE_SHIFT;
- read_lock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_read_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
@@ -617,18 +686,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
node = node->rb_left;
} else if (cur->index > index) {
node = node->rb_right;
- } else if (cur->elem.seq < min_seq) {
+ } else if (cur->seq < min_seq) {
node = node->rb_left;
} else if (!smallest) {
/* we want the node with the highest seq */
if (found)
- BUG_ON(found->elem.seq > cur->elem.seq);
+ BUG_ON(found->seq > cur->seq);
found = cur;
node = node->rb_left;
- } else if (cur->elem.seq > min_seq) {
+ } else if (cur->seq > min_seq) {
/* we want the node with the smallest seq */
if (found)
- BUG_ON(found->elem.seq < cur->elem.seq);
+ BUG_ON(found->seq < cur->seq);
found = cur;
node = node->rb_right;
} else {
@@ -636,7 +705,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
break;
}
}
- read_unlock(&fs_info->tree_mod_log_lock);
+ tree_mod_log_read_unlock(fs_info);
return found;
}
@@ -664,7 +733,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
-static inline void
+static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
struct extent_buffer *src, unsigned long dst_offset,
unsigned long src_offset, int nr_items)
@@ -675,18 +744,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
if (tree_mod_dont_log(fs_info, NULL))
return;
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
+ tree_mod_log_write_unlock(fs_info);
return;
+ }
- /* speed this up by single seq for all operations? */
for (i = 0; i < nr_items; i++) {
- ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
- MOD_LOG_KEY_REMOVE);
+ ret = tree_mod_log_insert_key_locked(fs_info, src,
+ i + src_offset,
+ MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
- ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
- MOD_LOG_KEY_ADD);
+ ret = tree_mod_log_insert_key_locked(fs_info, dst,
+ i + dst_offset,
+ MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
}
+
+ tree_mod_log_write_unlock(fs_info);
}
static inline void
@@ -699,10 +773,9 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
BUG_ON(ret < 0);
}
-static inline void
+static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int slot, int atomic)
+ struct extent_buffer *eb, int slot, int atomic)
{
int ret;
@@ -712,30 +785,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
BUG_ON(ret < 0);
}
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
- int i;
- int ret;
- u32 nritems;
-
if (tree_mod_dont_log(fs_info, eb))
return;
- nritems = btrfs_header_nritems(eb);
- for (i = nritems - 1; i >= 0; i--) {
- ret = tree_mod_log_insert_key(fs_info, eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING);
- BUG_ON(ret < 0);
- }
+ __tree_mod_log_free_eb(fs_info, eb);
+
+ tree_mod_log_write_unlock(fs_info);
}
-static inline void
+static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
struct extent_buffer *new_root_node)
{
int ret;
- tree_mod_log_free_eb(root->fs_info, root->node);
ret = tree_mod_log_insert_root(root->fs_info, root->node,
new_root_node, GFP_NOFS);
BUG_ON(ret < 0);
@@ -862,12 +927,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
BUG_ON(ret); /* -ENOMEM */
}
- /*
- * don't log freeing in case we're freeing the root node, this
- * is done by tree_mod_log_set_root_pointer later
- */
- if (buf != root->node && btrfs_header_level(buf) != 0)
- tree_mod_log_free_eb(root->fs_info, buf);
+ tree_mod_log_free_eb(root->fs_info, buf);
clean_tree_block(trans, root, buf);
*last_ref = 1;
}
@@ -1069,7 +1129,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
- while (tm && tm->elem.seq >= time_seq) {
+ while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
* the modification. as we're going backwards, we do the
@@ -1161,6 +1221,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
free_extent_buffer(eb);
__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
return eb_rewin;
}
@@ -1177,9 +1239,11 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct extent_buffer *eb;
+ struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
+ u32 blocksize;
eb = btrfs_read_lock_root_node(root);
tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
@@ -1195,14 +1259,32 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
}
tm = tree_mod_log_search(root->fs_info, logical, time_seq);
- if (old_root)
+ if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ blocksize = btrfs_level_size(root, old_root->level);
+ old = read_tree_block(root, logical, blocksize, 0);
+ if (!old) {
+ pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
+ logical);
+ WARN_ON(1);
+ } else {
+ eb = btrfs_clone_extent_buffer(old);
+ free_extent_buffer(old);
+ }
+ } else if (old_root) {
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
eb = alloc_dummy_extent_buffer(logical, root->nodesize);
- else
+ } else {
eb = btrfs_clone_extent_buffer(root->node);
- btrfs_tree_read_unlock(root->node);
- free_extent_buffer(root->node);
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ }
+
if (!eb)
return NULL;
+ extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
@@ -1215,11 +1297,28 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
__tree_mod_log_rewind(eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
- extent_buffer_get(eb);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
return eb;
}
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+
+ tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+ level = tm->old_root.level;
+ } else {
+ rcu_read_lock();
+ level = btrfs_header_level(root->node);
+ rcu_read_unlock();
+ }
+
+ return level;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1260,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
- if (trans->transaction != root->fs_info->running_transaction) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)
root->fs_info->running_transaction->transid);
- WARN_ON(1);
- }
- if (trans->transid != root->fs_info->generation) {
- printk(KERN_CRIT "trans %llu running %llu\n",
+
+ if (trans->transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
(unsigned long long)trans->transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
@@ -1368,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (cache_only && parent_level != 1)
return 0;
- if (trans->transaction != root->fs_info->running_transaction)
- WARN_ON(1);
- if (trans->transid != root->fs_info->generation)
- WARN_ON(1);
+ WARN_ON(trans->transaction != root->fs_info->running_transaction);
+ WARN_ON(trans->transid != root->fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1661,6 +1755,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
+ tree_mod_log_free_eb(root->fs_info, root->node);
tree_mod_log_set_root_pointer(root, child);
rcu_assign_pointer(root->node, child);
@@ -1725,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+ del_ptr(trans, root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
@@ -1734,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &right_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1769,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot, 1);
+ del_ptr(trans, root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
@@ -1778,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ tree_mod_log_set_node_key(root->fs_info, parent,
pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -1878,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot, 0);
+ pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1931,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
tree_mod_log_set_node_key(root->fs_info, parent,
- &disk_key, pslot + 1, 0);
+ pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2117,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
int no_skips = 0;
struct extent_buffer *t;
+ if (path->really_keep_locks)
+ return;
+
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
@@ -2164,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
int i;
- if (path->keep_locks)
+ if (path->keep_locks || path->really_keep_locks)
return;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2397,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (!cow)
write_lock_level = -1;
- if (cow && (p->keep_locks || p->lowest_level))
+ if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
@@ -2466,7 +2564,10 @@ again:
* must have write locks on this node and the
* parent
*/
- if (level + 1 > write_lock_level) {
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
write_lock_level = level + 1;
btrfs_release_path(p);
goto again;
@@ -2722,6 +2823,80 @@ done:
}
/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+				struct btrfs_key *key, struct btrfs_path *p,
+				int find_higher, int return_any)
+{
+	int ret;
+	struct extent_buffer *leaf;
+
+again:
+	/* plain lookup: no transaction handle, ins_len 0, cow 0 */
+	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+	/* an exact match (0) or an error (< 0) goes straight back */
+	if (ret <= 0)
+		return ret;
+	/*
+	 * a return value of 1 means the path is at the position where the
+	 * item should be inserted. Normally this is the next bigger item,
+	 * but in case the previous item is the last in a leaf, path points
+	 * to the first free slot in the previous leaf, i.e. at an invalid
+	 * item.
+	 */
+	leaf = p->nodes[0];
+
+	if (find_higher) {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, p);
+			if (ret <= 0)
+				return ret;
+			if (!return_any)
+				return 1;
+			/*
+			 * no higher item found, return the next
+			 * lower instead
+			 */
+			return_any = 0;
+			find_higher = 0;
+			btrfs_release_path(p);
+			goto again;
+		}
+	} else {
+		if (p->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, p);
+			if (ret < 0)
+				return ret;
+			if (!ret) {
+				/*
+				 * btrfs_prev_leaf() was handed the path and
+				 * may have moved it to a different leaf, so
+				 * the pointer cached above can be stale.
+				 * Reload it, and only step back when the slot
+				 * sits one past the last item of that leaf.
+				 */
+				leaf = p->nodes[0];
+				if (p->slots[0] == btrfs_header_nritems(leaf))
+					p->slots[0]--;
+				return 0;
+			}
+			if (!return_any)
+				return 1;
+			/*
+			 * no lower item found, return the next
+			 * higher instead
+			 */
+			return_any = 0;
+			find_higher = 1;
+			btrfs_release_path(p);
+			goto again;
+		} else {
+			/* the item just before the insert position */
+			--p->slots[0];
+		}
+	}
+	return 0;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -2741,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+ tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -2832,8 +3007,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
- tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
- src_nritems - push_items);
+ /*
+ * don't call tree_mod_log_eb_move here, key removal was already
+ * fully logged by tree_mod_log_eb_copy above.
+ */
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -3124,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
+ struct btrfs_item *start_item;
+ struct btrfs_item *end_item;
+ struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- data_len = btrfs_item_end_nr(l, start);
- data_len = data_len - btrfs_item_offset_nr(l, end);
+ btrfs_init_map_token(&token);
+ start_item = btrfs_item_nr(l, start);
+ end_item = btrfs_item_nr(l, end);
+ data_len = btrfs_token_item_offset(l, start_item, &token) +
+ btrfs_token_item_size(l, start_item, &token);
+ data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3225,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (push_items == 0)
goto out_unlock;
- if (!empty && push_items == left_nritems)
- WARN_ON(1);
+ WARN_ON(!empty && push_items == left_nritems);
/* push left to right */
right_nritems = btrfs_header_nritems(right);
@@ -3464,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
- if (push_items > right_nritems) {
- printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
right_nritems);
- WARN_ON(1);
- }
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4264,149 +4445,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
-{
- struct extent_buffer *leaf;
- struct btrfs_item *item;
- int ret = 0;
- int slot;
- int i;
- u32 nritems;
- u32 total_data = 0;
- u32 total_size = 0;
- unsigned int data_end;
- struct btrfs_disk_key disk_key;
- struct btrfs_key found_key;
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token);
-
- for (i = 0; i < nr; i++) {
- if (total_size + data_size[i] + sizeof(struct btrfs_item) >
- BTRFS_LEAF_DATA_SIZE(root)) {
- break;
- nr = i;
- }
- total_data += data_size[i];
- total_size += data_size[i] + sizeof(struct btrfs_item);
- }
- BUG_ON(nr == 0);
-
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
-
- nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
-
- if (btrfs_leaf_free_space(root, leaf) < total_size) {
- for (i = nr; i >= 0; i--) {
- total_data -= data_size[i];
- total_size -= data_size[i] + sizeof(struct btrfs_item);
- if (total_size < btrfs_leaf_free_space(root, leaf))
- break;
- }
- nr = i;
- }
-
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
- if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
- item = btrfs_item_nr(leaf, slot);
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
- /* figure out how many keys we can insert in here */
- total_data = data_size[0];
- for (i = 1; i < nr; i++) {
- if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
- break;
- total_data += data_size[i];
- }
- nr = i;
-
- if (old_data < data_end) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
- slot, old_data, data_end);
- BUG_ON(1);
- }
- /*
- * item0..itemN ... dataN.offset..dataN.size .. data0.size
- */
- /* first correct the data pointers */
- for (i = slot; i < nritems; i++) {
- u32 ioff;
-
- item = btrfs_item_nr(leaf, i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
- }
- /* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
- btrfs_item_nr_offset(slot),
- (nritems - slot) * sizeof(struct btrfs_item));
-
- /* shift the data */
- memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
- data_end - total_data, btrfs_leaf_data(leaf) +
- data_end, old_data - data_end);
- data_end = old_data;
- } else {
- /*
- * this sucks but it has to be done, if we are inserting at
- * the end of the leaf only insert 1 of the items, since we
- * have no way of knowing whats on the next leaf and we'd have
- * to drop our current locks to figure it out
- */
- nr = 1;
- }
-
- /* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
- btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
- data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
- }
- btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = 0;
- if (slot == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(trans, root, path, &disk_key, 1);
- }
-
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
- BUG();
- }
-out:
- if (!ret)
- ret = nr;
- return ret;
-}
-
-/*
* this is a helper for btrfs_insert_empty_items, the main goal here is
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
@@ -4567,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* empty a node.
*/
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot,
- int tree_mod_log)
+ struct btrfs_path *path, int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
@@ -4576,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
- if (tree_mod_log && level)
+ if (level)
tree_mod_log_eb_move(root->fs_info, parent, slot,
slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
@@ -4584,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
- } else if (tree_mod_log && level) {
+ } else if (level) {
ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
@@ -4621,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1], 1);
+ del_ptr(trans, root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -4931,6 +4968,434 @@ out:
return ret;
}
+/*
+ * Step the path one level towards the leaves: load the child that the
+ * current slot points at and start at its first slot. Must not be called
+ * at level 0.
+ */
+static void tree_move_down(struct btrfs_root *root,
+			   struct btrfs_path *path,
+			   int *level, int root_level)
+{
+	int cur = *level;
+	int below;
+
+	BUG_ON(cur == 0);
+	below = cur - 1;
+	path->nodes[below] = read_node_slot(root, path->nodes[cur],
+					    path->slots[cur]);
+	path->slots[below] = 0;
+	*level = below;
+}
+
+/*
+ * Move to the next slot at the current level. While that runs off the end
+ * of the block, drop the block, climb one level and try the next slot
+ * there. Returns 0 if only a sideways step was needed, 1 if at least one
+ * level was climbed, and -1 if the end of the root_level block was passed.
+ */
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    int *level, int root_level)
+{
+	int climbed = 0;
+	int count = btrfs_header_nritems(path->nodes[*level]);
+
+	path->slots[*level]++;
+
+	for (;;) {
+		if (path->slots[*level] < count)
+			return climbed;
+		if (*level == root_level)
+			return -1;
+
+		/* this block is exhausted: release it and go up */
+		path->slots[*level] = 0;
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		(*level)++;
+		path->slots[*level]++;
+
+		count = btrfs_header_nritems(path->nodes[*level]);
+		climbed = 1;
+	}
+}
+
+/*
+ * Advance the path by one step and load the key at the new position into
+ * *key. Descends one level when allow_down is set and we are not on a leaf,
+ * otherwise moves to the next slot (climbing if required).
+ * Returns 1 if it had to move up and next, 0 if it moved only next or down,
+ * and a negative value (key left untouched) when no further step exists.
+ */
+static int tree_advance(struct btrfs_root *root,
+			struct btrfs_path *path,
+			int *level, int root_level,
+			int allow_down,
+			struct btrfs_key *key)
+{
+	int ret = 0;
+	struct extent_buffer *eb;
+
+	if (*level != 0 && allow_down)
+		tree_move_down(root, path, level, root_level);
+	else
+		ret = tree_move_next_or_upnext(root, path, level, root_level);
+
+	if (ret < 0)
+		return ret;
+
+	eb = path->nodes[*level];
+	if (*level == 0)
+		btrfs_item_key_to_cpu(eb, key, path->slots[*level]);
+	else
+		btrfs_node_key_to_cpu(eb, key, path->slots[*level]);
+	return ret;
+}
+
+/*
+ * Compare the payload of the items both paths currently point at (level 0).
+ * The left item is copied into tmp_buf and compared against the right one.
+ * Returns 0 if size and content are identical, 1 otherwise.
+ */
+static int tree_compare_item(struct btrfs_root *left_root,
+			     struct btrfs_path *left_path,
+			     struct btrfs_path *right_path,
+			     char *tmp_buf)
+{
+	struct extent_buffer *l_eb = left_path->nodes[0];
+	struct extent_buffer *r_eb = right_path->nodes[0];
+	int l_slot = left_path->slots[0];
+	int r_slot = right_path->slots[0];
+	int len1, len2;
+	unsigned long off1, off2;
+
+	len1 = btrfs_item_size_nr(l_eb, l_slot);
+	len2 = btrfs_item_size_nr(r_eb, r_slot);
+	if (len1 != len2)
+		return 1;
+
+	off1 = btrfs_item_ptr_offset(l_eb, l_slot);
+	off2 = btrfs_item_ptr_offset(r_eb, r_slot);
+
+	read_extent_buffer(l_eb, tmp_buf, off1, len1);
+
+	return memcmp_extent_buffer(r_eb, tmp_buf, off2, len1) ? 1 : 0;
+}
+
+#define ADVANCE 1 /* advance; tree_advance() is allowed to descend */
+#define ADVANCE_ONLY_NEXT -1 /* advance with allow_down == 0 (no descending) */
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ *
+ * changed_cb is called with both roots, both paths, the key of the item and
+ * one of BTRFS_COMPARE_TREE_NEW, BTRFS_COMPARE_TREE_DELETED or
+ * BTRFS_COMPARE_TREE_CHANGED, plus the opaque ctx pointer. A negative return
+ * from the callback stops the walk and is returned to the caller.
+ *
+ * Returns 0 once the end of both trees has been reached, -ENOMEM if a path
+ * or the compare buffer cannot be allocated, -EIO if the ctransid of either
+ * root changed across a transaction rejoin, or a value propagated from
+ * joining/ending the transaction, btrfs_search_slot() or changed_cb.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t changed_cb, void *ctx)
+{
+ int ret;
+ int cmp;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_start_ctransid;
+ u64 right_start_ctransid;
+ u64 ctransid;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); /* scratch copy of one left item for tree_compare_item() */
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ spin_lock(&left_root->root_item_lock);
+ left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); /* baseline for the change check after a rejoin */
+ spin_unlock(&left_root->root_item_lock);
+
+ spin_lock(&right_root->root_item_lock);
+ right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
+ spin_unlock(&right_root->root_item_lock);
+
+ trans = btrfs_join_transaction(left_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL; /* keeps the cleanup at out: from ending an error pointer */
+ goto out;
+ }
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] = left_root->commit_root;
+ extent_buffer_get(left_path->nodes[left_level]);
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] = right_root->commit_root;
+ extent_buffer_get(right_path->nodes[right_level]);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ /*
+ * We need to make sure the transaction does not get committed
+ * while we do anything on commit roots. This means, we need to
+ * join and leave transactions for every item that we process.
+ */
+ if (trans && btrfs_should_end_transaction(trans, left_root)) {
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ ret = btrfs_end_transaction(trans, left_root);
+ trans = NULL; /* makes the rejoin branch below run */
+ if (ret < 0)
+ goto out;
+ }
+ /* now rejoin the transaction */
+ if (!trans) {
+ trans = btrfs_join_transaction(left_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ spin_lock(&left_root->root_item_lock);
+ ctransid = btrfs_root_ctransid(&left_root->root_item);
+ spin_unlock(&left_root->root_item_lock);
+ if (ctransid != left_start_ctransid)
+ left_start_ctransid = 0; /* 0 marks "changed" for the check below */
+
+ spin_lock(&right_root->root_item_lock);
+ ctransid = btrfs_root_ctransid(&right_root->root_item);
+ spin_unlock(&right_root->root_item_lock);
+ if (ctransid != right_start_ctransid)
+ right_start_ctransid = 0;
+
+ if (!left_start_ctransid || !right_start_ctransid) {
+ WARN(1, KERN_WARNING
+ "btrfs: btrfs_compare_tree detected "
+ "a change in one of the trees while "
+ "iterating. This is probably a "
+ "bug.\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * the commit root may have changed, so start again
+ * where we stopped
+ */
+ left_path->lowest_level = left_level;
+ right_path->lowest_level = right_level;
+ ret = btrfs_search_slot(NULL, left_root,
+ &left_key, left_path, 0, 0);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_search_slot(NULL, right_root,
+ &right_key, right_path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_root, left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret < 0)
+ left_end_reached = ADVANCE; /* ADVANCE is used here only as a non-zero flag */
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_root, right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret < 0)
+ right_end_reached = ADVANCE; /* ADVANCE is used here only as a non-zero flag */
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE; /* left is exhausted: keep walking the right tree */
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE; /* right is exhausted: keep walking the left tree */
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_root, left_path,
+ right_path, tmp_buf);
+ if (ret) { /* non-zero: same key, different size or content */
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_CHANGED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE; /* right is higher up: let it catch up with left */
+ } else {
+ advance_left = ADVANCE; /* left is higher up: let it catch up with right */
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kfree(tmp_buf);
+
+ if (trans) {
+ if (!ret)
+ ret = btrfs_end_transaction(trans, left_root); /* only report the end-transaction result when nothing failed earlier */
+ else
+ btrfs_end_transaction(trans, left_root);
+ }
+
+ return ret;
+}
+
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
@@ -5033,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
return btrfs_next_old_leaf(root, path, 0);
}
+/*
+ * Release the path up to but not including the given level: reset the
+ * slot, drop any lock that is held and put the extent buffer reference
+ * for every level below 'level'.
+ */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+	int lvl = 0;
+
+	while (lvl < level) {
+		struct extent_buffer *eb = path->nodes[lvl];
+
+		path->slots[lvl] = 0;
+		if (eb) {
+			if (path->locks[lvl]) {
+				btrfs_tree_unlock_rw(eb, path->locks[lvl]);
+				path->locks[lvl] = 0;
+			}
+			free_extent_buffer(eb);
+			path->nodes[lvl] = NULL;
+		}
+		lvl++;
+	}
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these are not true do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ *
+ * The WARN_ON at the top also accepts path->really_keep_locks in place of
+ * path->keep_locks.
+ *
+ * del selects the ins_len handed to the search helpers: -1 when set, 0
+ * otherwise.
+ *
+ * Returns 0 with path->slots[0] == 0 in the leaf that was reached, 1 if no
+ * level of the path has a further slot to step to, or a non-zero value
+ * propagated from the helpers it calls. On any non-zero return the path is
+ * released.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del)
+{
+ struct extent_buffer *b;
+ struct btrfs_key key;
+ u32 nritems;
+ int level = 1;
+ int slot;
+ int ret = 1;
+ int write_lock_level = BTRFS_MAX_LEVEL;
+ int ins_len = del ? -1 : 0; /* forwarded to btrfs_search_slot() and setup_nodes_for_search() */
+
+ WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); /* last key of the current leaf; used whenever we must re-search */
+
+ while (path->nodes[level]) { /* NOTE(review): level is not bounded by BTRFS_MAX_LEVEL here -- confirm nodes[] cannot be indexed past its end */
+ nritems = btrfs_header_nritems(path->nodes[level]);
+ if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { /* no write lock at this level: redo the search with cow == 1 */
+search:
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path,
+ ins_len, 1);
+ if (ret < 0)
+ goto out;
+ level = 1; /* restart the upward walk on the fresh path */
+ continue;
+ }
+
+ if (path->slots[level] >= nritems - 1) { /* already on the last pointer of this node: look one level higher */
+ level++;
+ continue;
+ }
+
+ btrfs_release_level(path, level); /* drop everything below the level we will step sideways at */
+ break;
+ }
+
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[level]++;
+ b = path->nodes[level];
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ btrfs_set_path_blocking(path);
+ ret = btrfs_cow_block(trans, root, b,
+ path->nodes[level + 1],
+ path->slots[level + 1], &b);
+ if (ret)
+ goto out;
+cow_done:
+ path->nodes[level] = b;
+ btrfs_clear_path_blocking(path, NULL, 0);
+ if (level != 0) {
+ ret = setup_nodes_for_search(trans, root, path, b,
+ level, ins_len,
+ &write_lock_level);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+
+ b = path->nodes[level];
+ slot = path->slots[level];
+
+ ret = read_block_for_search(trans, root, path,
+ &b, level, slot, &key, 0);
+ if (ret == -EAGAIN)
+ goto search;
+ if (ret)
+ goto out;
+ level = btrfs_header_level(b);
+ if (!btrfs_try_tree_write_lock(b)) { /* try-lock failed: switch the path to blocking and take the lock the slow way */
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(path, b,
+ BTRFS_WRITE_LOCK);
+ }
+ path->locks[level] = BTRFS_WRITE_LOCK;
+ path->nodes[level] = b;
+ path->slots[level] = 0; /* always enter the child at its first slot */
+ } else {
+ path->slots[level] = 0;
+ ret = 0;
+ break;
+ }
+ }
+
+out:
+ if (ret)
+ btrfs_release_path(path); /* any non-zero result, error or "no next leaf", drops the whole path */
+
+ return ret;
+}
+
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
@@ -5127,6 +5725,7 @@ again:
* locked. To solve this situation, we give up
* on our lock and cycle.
*/
+ free_extent_buffer(next);
btrfs_release_path(path);
cond_resched();
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa5c45b3907..547b7b05727 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
#define BTRFS_MAX_LEVEL 8
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -113,7 +116,7 @@ struct btrfs_ordered_sum;
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
/*
- * The inode number assigned to the special inode for sotring
+ * The inode number assigned to the special inode for storing
* free ino cache
*/
#define BTRFS_FREE_INO_OBJECTID -12ULL
@@ -139,6 +142,8 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+#define BTRFS_DEV_REPLACE_DEVID 0
+
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
@@ -151,6 +156,13 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_NAME_LEN 255
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
@@ -162,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS (1 << 30)
+
#define BTRFS_FT_UNKNOWN 0
#define BTRFS_FT_REG_FILE 1
#define BTRFS_FT_DIR 2
@@ -403,7 +418,7 @@ struct btrfs_root_backup {
__le64 bytes_used;
__le64 num_devices;
/* future */
- __le64 unsed_64[4];
+ __le64 unused_64[4];
u8 tree_root_level;
u8 chunk_root_level;
@@ -486,6 +501,8 @@ struct btrfs_super_block {
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -493,7 +510,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -558,6 +576,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int really_keep_locks:1;
};
/*
@@ -640,6 +659,14 @@ struct btrfs_inode_ref {
/* name goes here */
} __attribute__ ((__packed__));
+struct btrfs_inode_extref { /* packed record: three little-endian fixed fields followed by the name bytes */
+ __le64 parent_objectid; /* NOTE(review): presumably the objectid of the parent directory -- confirm */
+ __le64 index;
+ __le16 name_len; /* NOTE(review): presumably the length of name[] in bytes -- confirm against users */
+ __u8 name[0]; /* zero-length array: marks where the name starts, adds no size */
+ /* name goes here */
+} __attribute__ ((__packed__));
+
struct btrfs_timespec {
__le64 sec;
__le32 nsec;
@@ -709,6 +736,36 @@ struct btrfs_root_item {
struct btrfs_disk_key drop_progress;
u8 drop_level;
u8 level;
+
+ /*
+ * The following fields appear after subvol_uuids+subvol_times
+ * were introduced.
+ */
+
+ /*
+ * This generation number is used to test if the new fields are valid
+ * and up to date while reading the root item. Every time the root item
+ * is written out, the "generation" field is copied into this field. If
+ * anyone ever mounted the fs with an older kernel, we will have
+ * mismatching generation values here and thus must invalidate the
+ * new fields. See btrfs_update_root and btrfs_find_last_root for
+ * details.
+ * the offset of generation_v2 is also used as the start for the memset
+ * when invalidating the fields.
+ */
+ __le64 generation_v2;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u8 parent_uuid[BTRFS_UUID_SIZE];
+ u8 received_uuid[BTRFS_UUID_SIZE];
+ __le64 ctransid; /* updated when an inode changes */
+ __le64 otransid; /* trans when created */
+ __le64 stransid; /* trans when sent. non-zero for received subvol */
+ __le64 rtransid; /* trans when received. non-zero for received subvol */
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec otime;
+ struct btrfs_timespec stime;
+ struct btrfs_timespec rtime;
+ __le64 reserved[8]; /* for future */
} __attribute__ ((__packed__));
/*
@@ -834,6 +891,59 @@ struct btrfs_dev_stats_item {
__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
} __attribute__ ((__packed__));
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ u64 time_started; /* seconds since 1-Jan-1970 */
+ u64 time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ pid_t lock_owner;
+ atomic_t nesting_level;
+ struct mutex lock_finishing_cancel_unmount;
+ struct mutex lock_management_lock;
+ struct mutex lock;
+
+ struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 src_devid;
+ __le64 cursor_left;
+ __le64 cursor_right;
+ __le64 cont_reading_from_srcdev_mode;
+
+ __le64 replace_state;
+ __le64 time_started;
+ __le64 time_stopped;
+ __le64 num_write_errors;
+ __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -883,6 +993,72 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
+/*
+ * SCANNING is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning quota off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION 1
+
+struct btrfs_qgroup_status_item {
+ __le64 version;
+ /*
+ * the generation is updated during every commit. As older
+ * versions of btrfs are not aware of qgroups, it will be
+ * possible to detect inconsistencies by checking the
+ * generation on mount time
+ */
+ __le64 generation;
+
+ /* flag definitions see above */
+ __le64 flags;
+
+ /*
+ * only used during scanning to record the progress
+ * of the scan. It contains a logical address
+ */
+ __le64 scan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+ __le64 generation;
+ __le64 rfer;
+ __le64 rfer_cmpr;
+ __le64 excl;
+ __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+ /*
+ * only updated when any of the other values change
+ */
+ __le64 flags;
+ __le64 max_rfer;
+ __le64 max_excl;
+ __le64 rsv_rfer;
+ __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
struct btrfs_space_info {
u64 flags;
@@ -929,12 +1105,22 @@ struct btrfs_space_info {
wait_queue_head_t wait;
};
+#define BTRFS_BLOCK_RSV_GLOBAL 1
+#define BTRFS_BLOCK_RSV_DELALLOC 2
+#define BTRFS_BLOCK_RSV_TRANS 3
+#define BTRFS_BLOCK_RSV_CHUNK 4
+#define BTRFS_BLOCK_RSV_DELOPS 5
+#define BTRFS_BLOCK_RSV_EMPTY 6
+#define BTRFS_BLOCK_RSV_TEMP 7
+
struct btrfs_block_rsv {
u64 size;
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned int full;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
};
/*
@@ -1028,8 +1214,18 @@ struct btrfs_block_group_cache {
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
+
+ /* For delayed block group creation */
+ struct list_head new_bg_list;
};
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+/* fs_info */
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
@@ -1044,6 +1240,7 @@ struct btrfs_fs_info {
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *csum_root;
+ struct btrfs_root *quota_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1133,7 +1330,6 @@ struct btrfs_fs_info {
struct mutex reloc_mutex;
struct list_head trans_list;
- struct list_head hashers;
struct list_head dead_roots;
struct list_head caching_block_groups;
@@ -1144,6 +1340,7 @@ struct btrfs_fs_info {
spinlock_t tree_mod_seq_lock;
atomic_t tree_mod_seq;
struct list_head tree_mod_seq_list;
+ struct seq_list tree_mod_seq_elem;
/* this protects tree_mod_log */
rwlock_t tree_mod_log_lock;
@@ -1195,6 +1392,7 @@ struct btrfs_fs_info {
struct btrfs_workers generic_worker;
struct btrfs_workers workers;
struct btrfs_workers delalloc_workers;
+ struct btrfs_workers flush_workers;
struct btrfs_workers endio_workers;
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_meta_write_workers;
@@ -1240,6 +1438,8 @@ struct btrfs_fs_info {
*/
struct list_head space_info;
+ struct btrfs_space_info *data_sinfo;
+
struct reloc_control *reloc_ctl;
spinlock_t delalloc_lock;
@@ -1256,9 +1456,6 @@ struct btrfs_fs_info {
struct rb_root defrag_inodes;
atomic_t defrag_running;
- spinlock_t ref_cache_lock;
- u64 total_ref_cache_size;
-
/*
* these three are in extended format (availability of single
* chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1292,10 +1489,35 @@ struct btrfs_fs_info {
struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+ struct btrfs_workers scrub_wr_completion_workers;
+ struct btrfs_workers scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
+ /*
+ * quota information
+ */
+ unsigned int quota_enabled:1;
+
+ /*
+ * quota_enabled only changes state after a commit. This holds the
+ * next state.
+ */
+ unsigned int pending_quota_state:1;
+
+ /* is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* holds configuration and tracking. Protected by qgroup_lock */
+ struct rb_root qgroup_tree;
+ spinlock_t qgroup_lock;
+
+ /* list of dirty qgroups to be written at next commit */
+ struct list_head dirty_qgroups;
+
+ /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ u64 qgroup_seq;
/* filesystem state */
u64 fs_state;
@@ -1308,6 +1530,13 @@ struct btrfs_fs_info {
/* next backup root to be overwritten */
int backup_root_index;
+
+ int num_tolerated_disk_barrier_failures;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ atomic_t mutually_exclusive_operation_running;
};
/*
@@ -1348,9 +1577,9 @@ struct btrfs_root {
wait_queue_head_t log_commit_wait[2];
atomic_t log_writers;
atomic_t log_commit[2];
+ atomic_t log_batch;
unsigned long log_transid;
unsigned long last_log_commit;
- unsigned long log_batch;
pid_t log_start_pid;
bool log_multiple_pids;
@@ -1416,6 +1645,8 @@ struct btrfs_root {
dev_t anon_dev;
int force_cow;
+
+ spinlock_t root_item_lock;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1457,6 +1688,7 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_INODE_ITEM_KEY 1
#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_INODE_EXTREF_KEY 13
#define BTRFS_XATTR_ITEM_KEY 24
#define BTRFS_ORPHAN_ITEM_KEY 48
/* reserve 2-15 close to the inode for later flexibility */
@@ -1525,6 +1757,30 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY 240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY 242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY 244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY 246
+
#define BTRFS_BALANCE_ITEM_KEY 248
/*
@@ -1534,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_STATS_KEY 249
/*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY 250
+
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
@@ -1598,7 +1860,7 @@ struct btrfs_map_token {
static inline void btrfs_init_map_token (struct btrfs_map_token *token)
{
- memset(token, 0, sizeof(*token));
+ token->kaddr = NULL;
}
/* some macros to generate set/get funcs for the struct fields. This
@@ -1621,13 +1883,54 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
offsetof(type, member), \
sizeof(((type *)0)->member)))
-#ifndef BTRFS_SETGET_FUNCS
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val, \
+ struct btrfs_map_token *token); \
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off) \
+{ \
+ return btrfs_get_token_##bits(eb, ptr, off, NULL); \
+} \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
-u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
-u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \
-void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
-#endif
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+} \
+static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \
@@ -1778,6 +2081,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
/* struct btrfs_inode_item */
BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -2189,6 +2499,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+ otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+ stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+ rtransid, 64);
static inline bool btrfs_root_readonly(struct btrfs_root *root)
{
@@ -2465,6 +2785,92 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
+ scan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
@@ -2605,10 +3011,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+ /*
+ * Flushing delalloc may cause deadlock somewhere, in this
+ * case, use BTRFS_RESERVE_FLUSH_LIMIT
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_ALL,
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2622,24 +3041,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved);
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes);
@@ -2661,6 +3075,9 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2680,6 +3097,21 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_key *max_key,
struct btrfs_path *path, int cache_only,
u64 min_trans);
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t cb, void *ctx);
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
@@ -2711,6 +3143,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
ins_len, int cow);
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
@@ -2753,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int del);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -2793,11 +3231,23 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->chunk_root);
kfree(fs_info->dev_root);
kfree(fs_info->csum_root);
+ kfree(fs_info->quota_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic_inc_return(&fs_info->tree_mod_seq);
+}
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -2819,6 +3269,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_root_item *item);
+void btrfs_read_root_item(struct btrfs_root *root,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
@@ -2826,8 +3279,12 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
int name_len, struct inode *dir,
@@ -2884,12 +3341,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod,
+ u64 *ret_index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2897,13 +3354,26 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+ u64 ref_objectid, const char *name,
+ int name_len,
+ struct btrfs_inode_extref **extref_ret);
+
/* file-item.c */
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 logical_offset, u32 *dst);
+ struct bio *bio, u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2914,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
@@ -2929,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
/* inode.c */
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ int wait;
+ int delay_iput;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create);
@@ -2961,6 +3445,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -2995,6 +3481,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
@@ -3020,16 +3508,30 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+
/* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+ u64 start, u64 end, int skip_pinned,
+ int modified);
extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
@@ -3053,14 +3555,48 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+__printf(5, 6)
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ }
+}
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
#define btrfs_abort_transaction(trans, root, errno) \
do { \
__btrfs_abort_transaction(trans, root, __func__, \
@@ -3080,6 +3616,7 @@ do { \
(errno), fmt, ##args); \
} while (0)
+__printf(5, 6)
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
@@ -3127,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
- struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
void btrfs_scrub_pause(struct btrfs_root *root);
void btrfs_scrub_pause_super(struct btrfs_root *root);
void btrfs_scrub_continue(struct btrfs_root *root);
void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+ struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
@@ -3156,17 +3694,49 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
-/* delayed seq elem */
-struct seq_list {
+/* qgroup.c */
+struct qgroup_update {
struct list_head list;
- u64 seq;
- u32 flags;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_delayed_extent_op *extent_op;
};
-void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2399f408691..34836036f01 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
int __init btrfs_delayed_inode_init(void)
{
- delayed_node_cache = kmem_cache_create("delayed_node",
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
sizeof(struct btrfs_delayed_node),
0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -62,6 +62,7 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
delayed_node->bytes_reserved = 0;
+ memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -511,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
rb_erase(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
- atomic_dec(&delayed_root->items);
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
+ if (atomic_dec_return(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND &&
waitqueue_active(&delayed_root->wait))
wake_up(&delayed_root->wait);
}
@@ -649,8 +650,9 @@ static int btrfs_delayed_inode_reserve_metadata(
* we're accounted for.
*/
if (!src_rsv || (!trans->bytes_reserved &&
- src_rsv != &root->fs_info->delalloc_block_rsv)) {
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* Since we're under a transaction reserve_metadata_bytes could
* try to commit the transaction which will make it return
@@ -667,7 +669,7 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+ } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
spin_lock(&BTRFS_I(inode)->lock);
if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
&BTRFS_I(inode)->runtime_flags)) {
@@ -685,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
* reserve something strictly for us. If not be a pain and try
* to steal from the delalloc block rsv.
*/
- ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret)
goto out;
@@ -1027,9 +1030,10 @@ do_again:
btrfs_release_delayed_item(prev);
ret = 0;
btrfs_release_path(path);
- if (curr)
+ if (curr) {
+ mutex_unlock(&node->mutex);
goto do_again;
- else
+ } else
goto delete_fail;
}
@@ -1054,8 +1058,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
delayed_node->count--;
delayed_root = delayed_node->root->fs_info->delayed_root;
- atomic_dec(&delayed_root->items);
- if (atomic_read(&delayed_root->items) <
+ if (atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND &&
waitqueue_active(&delayed_root->wait))
wake_up(&delayed_root->wait);
@@ -1113,8 +1116,8 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* Returns < 0 on error and returns with an aborted transaction with any
* outstanding delayed items cleaned up.
*/
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
{
struct btrfs_root *curr_root = root;
struct btrfs_delayed_root *delayed_root;
@@ -1122,6 +1125,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
+ bool count = (nr > 0);
if (trans->aborted)
return -EIO;
@@ -1137,7 +1141,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
delayed_root = btrfs_get_delayed_root(root);
curr_node = btrfs_first_delayed_node(delayed_root);
- while (curr_node) {
+ while (curr_node && (!count || (count && nr--))) {
curr_root = curr_node->root;
ret = btrfs_insert_delayed_items(trans, path, curr_root,
curr_node);
@@ -1149,6 +1153,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path, curr_node);
if (ret) {
btrfs_release_delayed_node(curr_node);
+ curr_node = NULL;
btrfs_abort_transaction(trans, root, ret);
break;
}
@@ -1158,12 +1163,26 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
btrfs_release_delayed_node(prev_node);
}
+ if (curr_node)
+ btrfs_release_delayed_node(curr_node);
btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
}
+/*
+ * Run the delayed items without limiting the number of delayed nodes that
+ * are processed: forwards to __btrfs_run_delayed_items() with nr = -1, which
+ * that helper treats as "no limit" (only nr > 0 enables counting).
+ *
+ * Returns whatever __btrfs_run_delayed_items() returns.
+ */
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+/*
+ * Like btrfs_run_delayed_items(), but passes the caller's @nr through to
+ * __btrfs_run_delayed_items().  A value of @nr > 0 bounds the number of
+ * delayed nodes handled in this call; @nr <= 0 disables the bound.
+ *
+ * Returns whatever __btrfs_run_delayed_items() returns.
+ */
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ return __btrfs_run_delayed_items(trans, root, nr);
+}
+
static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
@@ -1238,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
struct btrfs_delayed_node *delayed_node = NULL;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
- unsigned long nr = 0;
int need_requeue = 0;
int ret;
@@ -1299,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
delayed_node);
mutex_unlock(&delayed_node->mutex);
- nr = trans->blocks_used;
-
trans->block_rsv = block_rsv;
btrfs_end_transaction_dmeta(trans, root);
- __btrfs_btree_balance_dirty(root, nr);
+ btrfs_btree_balance_dirty_nodelay(root);
free_path:
btrfs_free_path(path);
out:
@@ -1698,8 +1714,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
- btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
- btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
@@ -1747,8 +1763,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode_item = &delayed_node->inode_item;
- inode->i_uid = btrfs_stack_inode_uid(inode_item);
- inode->i_gid = btrfs_stack_inode_gid(inode_item);
+ i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f5aa4023d3e..4f808e1baee 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -107,6 +107,8 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode);
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr);
void btrfs_balance_delayed_items(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae7b04790..ae941177339 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -38,17 +38,14 @@
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
struct btrfs_delayed_tree_ref *ref1)
{
- if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
- if (ref1->root < ref2->root)
- return -1;
- if (ref1->root > ref2->root)
- return 1;
- } else {
- if (ref1->parent < ref2->parent)
- return -1;
- if (ref1->parent > ref2->parent)
- return 1;
- }
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
return 0;
}
@@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
* type of the delayed backrefs and content of delayed backrefs.
*/
static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1)
+ struct btrfs_delayed_ref_node *ref1,
+ bool compare_seq)
{
if (ref1->bytenr < ref2->bytenr)
return -1;
@@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
if (ref1->type > ref2->type)
return 1;
/* merging of sequenced refs is not allowed */
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
+ if (compare_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
rb_node);
- cmp = comp_entry(entry, ins);
+ cmp = comp_entry(entry, ins, 1);
if (cmp < 0)
p = &(*p)->rb_left;
else if (cmp > 0)
@@ -233,22 +233,134 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq)
+/*
+ * Take @ref out of the delayed ref tree and release the tree's reference.
+ *
+ * The node is erased from delayed_refs->root, marked as no longer in the
+ * tree and put.  delayed_refs->num_entries is decremented, and so is
+ * trans->delayed_ref_updates as long as it is not already zero.
+ *
+ * NOTE(review): @ref is not touched here after btrfs_put_delayed_ref();
+ * presumably the put may free it, so callers should not rely on it unless
+ * they hold their own reference - confirm against btrfs_put_delayed_ref().
+ */
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_root *delayed_refs,
+				    struct btrfs_delayed_ref_node *ref)
{
-	struct seq_list *elem;
+	rb_erase(&ref->rb_node, &delayed_refs->root);
+	ref->in_tree = 0;
+	btrfs_put_delayed_ref(ref);
+	delayed_refs->num_entries--;
+	/* Never let the per-transaction counter go below zero. */
+	if (trans->delayed_ref_updates)
+		trans->delayed_ref_updates--;
+}
- assert_spin_locked(&delayed_refs->lock);
- if (list_empty(&delayed_refs->seq_head))
- return 0;
+/*
+ * Try to fold earlier delayed refs for the same extent into @ref.
+ *
+ * Walks backwards (rb_prev) from @ref over nodes that share ref->bytenr.
+ * The walk stops at the first node with a different bytenr, or, when @seq is
+ * non-zero, at the first node whose seq is >= @seq.  Nodes that do not
+ * compare equal to @ref under comp_entry() with sequence comparison disabled
+ * are skipped.
+ *
+ * A matching node is dropped from the tree and its ref_mod is added to
+ * (same action) or subtracted from (different action) the surviving ref.
+ * If the surviving ref's ref_mod reaches zero it is dropped as well.
+ *
+ * Returns the number of nodes that were merged away.
+ */
+static int merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+ struct rb_node *node;
+ int merged = 0;
+ int mod = 0;
+ int done = 0;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ struct btrfs_delayed_ref_node *next;
+
+ next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ /* Advance first: 'next' may be erased from the tree below. */
+ node = rb_prev(node);
+ if (next->bytenr != ref->bytenr)
+ break;
+ if (seq && next->seq >= seq)
+ break;
+ if (comp_entry(ref, next, 0))
+ continue;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ /*
+ * Opposite actions cancel out.  Keep the entry with the
+ * larger ref_mod as the survivor; after swapping, this
+ * is the last merge done by this call.
+ */
+ if (ref->ref_mod < next->ref_mod) {
+ struct btrfs_delayed_ref_node *tmp;
- elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
- if (seq >= elem->seq) {
- pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
- seq, elem->seq, delayed_refs);
- return 1;
+ tmp = ref;
+ ref = next;
+ next = tmp;
+ done = 1;
+ }
+ mod = -next->ref_mod;
+ }
+
+ merged++;
+ drop_delayed_ref(trans, delayed_refs, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, ref);
+ break;
+ } else {
+ /*
+ * You can't have multiples of the same ref on a tree
+ * block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+
+ if (done)
+ break;
+ /* The tree changed; restart from the survivor's predecessor. */
+ node = rb_prev(&ref->rb_node);
}
- return 0;
+
+ return merged;
+}
+
+/*
+ * Merge the delayed refs that belong to @head where possible.
+ *
+ * The seq of the first element on fs_info->tree_mod_seq_list (read under
+ * tree_mod_seq_lock) is used as an upper bound: refs with a seq at or above
+ * it are not merged.  When the list is empty the bound is 0, which disables
+ * the check.
+ *
+ * The refs are visited by walking backwards (rb_prev) from the head node for
+ * as long as the bytenr matches the head's bytenr.
+ *
+ * NOTE(review): the first list element is presumably the one with the lowest
+ * seq - confirm against how tree_mod_seq_list is maintained.
+ */
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct rb_node *node;
+ u64 seq = 0;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ node = rb_prev(&head->node.rb_node);
+ while (node) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+
+ /* We can't merge refs that are outside of our seq count */
+ if (seq && ref->seq >= seq)
+ break;
+ /*
+ * merge_ref() removes nodes when it merges something, so start
+ * over from the head's predecessor in that case instead of
+ * continuing from a node that may be gone.
+ */
+ if (merge_ref(trans, delayed_refs, ref, seq))
+ node = rb_prev(&head->node.rb_node);
+ else
+ node = rb_prev(node);
+ }
+}
+
+/*
+ * Decide whether a delayed ref with sequence number @seq has to be held back.
+ *
+ * Returns 1 when fs_info->tree_mod_seq_list is not empty and @seq is at or
+ * above the seq of its first element, 0 otherwise.  The list is inspected
+ * under fs_info->tree_mod_seq_lock.  @delayed_refs is only used for the
+ * debug message.
+ */
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq)
+{
+	struct seq_list *first;
+	int hold_back = 0;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (list_empty(&fs_info->tree_mod_seq_list))
+		goto unlock;
+
+	first = list_first_entry(&fs_info->tree_mod_seq_list,
+				 struct seq_list, list);
+	if (seq >= first->seq) {
+		pr_debug("holding back delayed_ref %llu, lowest is "
+			 "%llu (%p)\n", seq, first->seq, delayed_refs);
+		hold_back = 1;
+	}
+
+unlock:
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return hold_back;
}
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
@@ -332,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
* every changing the extent allocation tree.
*/
existing->ref_mod--;
- if (existing->ref_mod == 0) {
- rb_erase(&existing->rb_node,
- &delayed_refs->root);
- existing->in_tree = 0;
- btrfs_put_delayed_ref(existing);
- delayed_refs->num_entries--;
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
- } else {
+ if (existing->ref_mod == 0)
+ drop_delayed_ref(trans, delayed_refs, existing);
+ else
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
} else {
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -525,8 +630,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (is_fstree(ref_root))
- seq = inc_delayed_seq(delayed_refs);
+ if (need_ref_seq(for_cow, ref_root))
+ seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -584,8 +689,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (is_fstree(ref_root))
- seq = inc_delayed_seq(delayed_refs);
+ if (need_ref_seq(for_cow, ref_root))
+ seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -658,10 +763,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
for_cow);
- if (!is_fstree(ref_root) &&
- waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+ if (need_ref_seq(for_cow, ref_root))
+ btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -707,10 +811,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
action, for_cow);
- if (!is_fstree(ref_root) &&
- waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+ if (need_ref_seq(for_cow, ref_root))
+ btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -736,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
- if (waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 413927fb995..c9d703693df 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,7 +18,7 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
-/* these are the possible values of struct btrfs_delayed_ref->action */
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
@@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
-
- /*
- * seq number of delayed refs. We need to know if a backref was being
- * added before the currently processed ref or afterwards.
- */
- u64 seq;
-
- /*
- * seq_list holds a list of all seq numbers that are currently being
- * added to the list. While walking backrefs (btrfs_find_all_roots,
- * qgroups), which might take some time, no newer ref must be processed,
- * as it might influence the outcome of the walk.
- */
- struct list_head seq_head;
-
- /*
- * when the only refs we have in the list must not be processed, we want
- * to wait for more refs to show up or for the end of backref walking.
- */
- wait_queue_head_t seq_wait;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -187,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
@@ -195,34 +179,28 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
-static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
-{
- assert_spin_locked(&delayed_refs->lock);
- ++delayed_refs->seq;
- return delayed_refs->seq;
-}
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
-static inline void
-btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- struct seq_list *elem)
+/*
+ * Delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * This only applies to items in one of the fs-trees.  for_cow items never
+ * need to be held back, so they won't get a ref_seq number.
+ *
+ * Returns 1 if a ref for tree @rootid needs a sequence number, 0 otherwise.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
{
- assert_spin_locked(&delayed_refs->lock);
- elem->seq = delayed_refs->seq;
- list_add_tail(&elem->list, &delayed_refs->seq_head);
-}
+ if (for_cow)
+ return 0;
-static inline void
-btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- struct seq_list *elem)
-{
- spin_lock(&delayed_refs->lock);
- list_del(&elem->list);
- wake_up(&delayed_refs->seq_wait);
- spin_unlock(&delayed_refs->lock);
-}
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return 1;
-int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
- u64 seq);
+ /*
+ * Signed comparison on purpose: ids with the top bit set turn negative
+ * as s64 and therefore do not qualify.
+ */
+ if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
+ return 0;
+}
/*
* a node might live in a head or a regular ref, this lets you
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 00000000000..66dbc8dbddf
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+
+static u64 btrfs_get_seconds_since_1970(void);
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+/*
+ * Load the persisted device replace state from the device tree into
+ * fs_info->dev_replace.
+ *
+ * The item is looked up with the key (0, BTRFS_DEV_REPLACE_KEY, 0).  If the
+ * lookup returns anything other than 0, or the item found does not have the
+ * size of struct btrfs_dev_replace_item, the in-memory state is reset to
+ * "never started", is_valid is cleared and 0 is returned.
+ * NOTE(review): a negative (error) result of btrfs_search_slot() takes the
+ * same path as "not found" and is therefore not reported to the caller.
+ *
+ * For an operation in state STARTED or SUSPENDED the source and the target
+ * device are looked up.  A missing device is an error unless the DEGRADED
+ * mount option is set.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated and -EIO if
+ * a required device is missing.
+ */
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	int item_size;
+	struct btrfs_dev_replace_item *ptr;
+	u64 src_devid;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_REPLACE_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+	if (ret) {
+		/*
+		 * Also entered by goto from the size check below: start from
+		 * a clean "never started" state.
+		 */
+no_valid_dev_replace_entry_found:
+		ret = 0;
+		dev_replace->replace_state =
+			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+		dev_replace->cont_reading_from_srcdev_mode =
+		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+		dev_replace->time_started = 0;
+		dev_replace->time_stopped = 0;
+		atomic64_set(&dev_replace->num_write_errors, 0);
+		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+		dev_replace->cursor_left = 0;
+		dev_replace->committed_cursor_left = 0;
+		dev_replace->cursor_left_last_write_of_item = 0;
+		dev_replace->cursor_right = 0;
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		dev_replace->is_valid = 0;
+		dev_replace->item_needs_writeback = 0;
+		goto out;
+	}
+	slot = path->slots[0];
+	eb = path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+		goto no_valid_dev_replace_entry_found;
+	}
+
+	/* Copy the on-disk fields into the in-memory state. */
+	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+	dev_replace->cont_reading_from_srcdev_mode =
+		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+	dev_replace->time_stopped =
+		btrfs_dev_replace_time_stopped(eb, ptr);
+	atomic64_set(&dev_replace->num_write_errors,
+		     btrfs_dev_replace_num_write_errors(eb, ptr));
+	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+	dev_replace->committed_cursor_left = dev_replace->cursor_left;
+	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+	dev_replace->is_valid = 1;
+
+	dev_replace->item_needs_writeback = 0;
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		dev_replace->srcdev = NULL;
+		dev_replace->tgtdev = NULL;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+							NULL, NULL);
+		dev_replace->tgtdev = btrfs_find_device(fs_info,
+							BTRFS_DEV_REPLACE_DEVID,
+							NULL, NULL);
+		/*
+		 * allow 'btrfs dev replace_cancel' if src/tgt device is
+		 * missing
+		 */
+		if (!dev_replace->srcdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n"
+				"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+				(unsigned long long)src_devid);
+		}
+		if (!dev_replace->tgtdev &&
+		    !btrfs_test_opt(dev_root, DEGRADED)) {
+			ret = -EIO;
+			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n"
+				"tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
+				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+		}
+		if (dev_replace->tgtdev) {
+			/* The target mirrors the size accounting of the source. */
+			if (dev_replace->srcdev) {
+				dev_replace->tgtdev->total_bytes =
+					dev_replace->srcdev->total_bytes;
+				dev_replace->tgtdev->disk_total_bytes =
+					dev_replace->srcdev->disk_total_bytes;
+				dev_replace->tgtdev->bytes_used =
+					dev_replace->srcdev->bytes_used;
+			}
+			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+				dev_replace->tgtdev);
+		}
+		break;
+	}
+
+out:
+	if (path)
+		btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Called from commit_transaction.  Writes changed device replace state to
+ * disk.
+ *
+ * Nothing is done (and 0 is returned) unless the in-memory state is valid
+ * and flagged as needing writeback.  Otherwise the dev_replace item with the
+ * key (0, BTRFS_DEV_REPLACE_KEY, 0) is looked up in the device tree, created
+ * if it does not exist (or replaced if it is too small), filled from
+ * fs_info->dev_replace and the leaf is marked dirty.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
+ * error returned by the tree search, item deletion or item insertion.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ /* The lock is dropped for the tree operations and retaken below. */
+ btrfs_dev_replace_unlock(dev_replace);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ /* Fall into the "item not found" handling below. */
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ pr_warn("btrfs: insert dev_replace item failed %d!\n",
+ ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ /* Copy the in-memory state into the item while holding the lock. */
+ btrfs_dev_replace_lock(dev_replace);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ /* Remember which cursor_left value went into the item. */
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ /* NOTE(review): reached with path == NULL on the -ENOMEM path. */
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/*
+ * Record the cursor_left value that was last written to the dev_replace item
+ * as the committed one.
+ */
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+	fs_info->dev_replace.committed_cursor_left =
+		fs_info->dev_replace.cursor_left_last_write_of_item;
+}
+
+/* Return the seconds part of CURRENT_TIME_SEC. */
+static u64 btrfs_get_seconds_since_1970(void)
+{
+	const struct timespec now = CURRENT_TIME_SEC;
+
+	return (u64)now.tv_sec;
+}
+
+/*
+ * Start replacing a source device with a target device.
+ *
+ * @root: root used for the device lookups and the transaction
+ * @args: ioctl arguments; args->start names the devices and the
+ *        cont_reading_from_srcdev mode, args->result is set to
+ *        BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR or
+ *        BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED
+ *
+ * Returns 0 once the copy has been run and handed to
+ * btrfs_dev_replace_finishing(), or when a replace is already started or
+ * suspended; -EINVAL for a bad mode, missing names, unusable devices or a
+ * target smaller than the source; otherwise the error from starting the
+ * transaction.
+ */
+int btrfs_dev_replace_start(struct btrfs_root *root,
+			    struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int ret;
+	struct btrfs_device *tgt_device = NULL;
+	struct btrfs_device *src_device = NULL;
+
+	switch (args->start.cont_reading_from_srcdev_mode) {
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* the source needs a devid or a name, the target always a name */
+	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+	    args->start.tgtdev_name[0] == '\0')
+		return -EINVAL;
+
+	mutex_lock(&fs_info->volume_mutex);
+	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+					    &tgt_device);
+	if (ret) {
+		pr_err("btrfs: target device %s is invalid!\n",
+		       args->start.tgtdev_name);
+		mutex_unlock(&fs_info->volume_mutex);
+		return -EINVAL;
+	}
+
+	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+					    args->start.srcdev_name,
+					    &src_device);
+	mutex_unlock(&fs_info->volume_mutex);
+	if (ret) {
+		ret = -EINVAL;
+		goto leave_no_lock;
+	}
+
+	if (tgt_device->total_bytes < src_device->total_bytes) {
+		pr_err("btrfs: target device is smaller than source device!\n");
+		ret = -EINVAL;
+		goto leave_no_lock;
+	}
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * dev_replace->srcdev/tgtdev belong to the replace that is
+		 * already in progress: leave them alone and only discard the
+		 * target prepared above. ret is still 0 here.
+		 */
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+		btrfs_dev_replace_unlock(dev_replace);
+		goto leave_no_lock;
+	}
+
+	dev_replace->cont_reading_from_srcdev_mode =
+		args->start.cont_reading_from_srcdev_mode;
+	WARN_ON(!src_device);
+	dev_replace->srcdev = src_device;
+	WARN_ON(!tgt_device);
+	dev_replace->tgtdev = tgt_device;
+
+	printk_in_rcu(KERN_INFO
+		      "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+		      src_device->missing ? "<missing disk>" :
+			rcu_str_deref(src_device->name),
+		      src_device->devid,
+		      rcu_str_deref(tgt_device->name));
+
+	tgt_device->total_bytes = src_device->total_bytes;
+	tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+	tgt_device->bytes_used = src_device->bytes_used;
+
+	/*
+	 * from now on, the writes to the srcdev are all duplicated to
+	 * go to the tgtdev as well (refer to btrfs_map_block()).
+	 */
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+	dev_replace->time_started = btrfs_get_seconds_since_1970();
+	dev_replace->cursor_left = 0;
+	dev_replace->committed_cursor_left = 0;
+	dev_replace->cursor_left_last_write_of_item = 0;
+	dev_replace->cursor_right = 0;
+	dev_replace->is_valid = 1;
+	dev_replace->item_needs_writeback = 1;
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	btrfs_dev_replace_unlock(dev_replace);
+
+	btrfs_wait_ordered_extents(root, 0);
+
+	/* force writing the updated state information to disk */
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_dev_replace_lock(dev_replace);
+		/*
+		 * The devices are detached below, so the state must not stay
+		 * STARTED with srcdev/tgtdev set to NULL.
+		 */
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+		goto leave;
+	}
+
+	ret = btrfs_commit_transaction(trans, root);
+	WARN_ON(ret);
+
+	/* the disk copy procedure reuses the scrub code */
+	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+			      src_device->total_bytes,
+			      &dev_replace->scrub_progress, 0, 1);
+
+	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+	WARN_ON(ret);
+
+	return 0;
+
+leave:
+	/* entered with the dev_replace lock held */
+	dev_replace->srcdev = NULL;
+	dev_replace->tgtdev = NULL;
+	btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+	if (tgt_device)
+		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+	return ret;
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /*
+ * Conclude a replace run whose copy step returned scrub_ret.
+ * scrub_ret == 0: the target takes over the source's devid, uuid and
+ * sizes and the state becomes FINISHED.
+ * scrub_ret != 0: the state becomes CANCELED and the target device is
+ * destroyed.
+ * Returns 0, or the error from starting the first transaction, in
+ * which case replace_state is left as it was.
+ *
+ * don't allow cancel or unmount to disturb the finishing procedure
+ */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_dev_replace_lock(dev_replace);
+ /*
+ * was the operation canceled, or is it finished? Any state other
+ * than STARTED means there is nothing left to do here.
+ */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ /* replace old device with new one in mapping tree (copy succeeded) */
+ if (!scrub_ret)
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+
+ if (scrub_ret) { /* copy failed: log it and drop the target device */
+ printk_in_rcu(KERN_ERR
+ "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ if (tgt_device) /* NOTE(review): tgt_device was already dereferenced in the message above */
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+ }
+
+ printk_in_rcu(KERN_INFO
+ "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+ tgt_device->is_tgtdev_for_dev_replace = 0;
+ tgt_device->devid = src_device->devid; /* target takes over the source's devid */
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ tgt_device->bytes_used = src_device->bytes_used;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); /* swap the two uuids via uuid_tmp */
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ tgt_device->total_bytes = src_device->total_bytes;
+ tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+ tgt_device->bytes_used = src_device->bytes_used; /* NOTE(review): repeats the assignment made a few lines above */
+ if (fs_info->sb->s_bdev == src_device->bdev)
+ fs_info->sb->s_bdev = tgt_device->bdev;
+ if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+ fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+ btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ if (src_device->bdev) {
+ /* zero out the old super */
+ btrfs_scratch_superblock(src_device);
+ }
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ /*
+ * write back the superblocks; a failure to start this transaction
+ * and the result of the commit are both ignored
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans, root);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+}
+
+/*
+ * Walk every extent map of the chunk mapping tree and make each stripe that
+ * points at srcdev point at tgtdev instead. The walk runs under the tree's
+ * write lock and ends when no further map is found or the next start offset
+ * wraps to 0.
+ */
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+						struct btrfs_fs_info *fs_info,
+						struct btrfs_device *srcdev,
+						struct btrfs_device *tgtdev)
+{
+	struct extent_map_tree *tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *lookup;
+	u64 next = 0;
+	int stripe;
+
+	write_lock(&tree->lock);
+	for (;;) {
+		em = lookup_extent_mapping(tree, next, (u64)-1);
+		if (!em)
+			break;
+
+		/* the map_lookup is stored in the extent map's bdev field */
+		lookup = (struct map_lookup *)em->bdev;
+		for (stripe = 0; stripe < lookup->num_stripes; stripe++) {
+			if (lookup->stripes[stripe].dev == srcdev)
+				lookup->stripes[stripe].dev = tgtdev;
+		}
+
+		next = em->start + em->len;
+		free_extent_map(em);
+		if (!next)
+			break;
+	}
+	write_unlock(&tree->lock);
+}
+
+/*
+ * Look up the source device and store it in *device.
+ *
+ * A non-zero srcdevid is looked up by devid (-ENOENT when not found);
+ * otherwise the lookup is delegated to
+ * btrfs_find_device_missing_or_by_path() with srcdev_name and its result is
+ * returned.
+ */
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+					 char *srcdev_name,
+					 struct btrfs_device **device)
+{
+	if (!srcdevid)
+		return btrfs_find_device_missing_or_by_path(root, srcdev_name,
+							    device);
+
+	*device = btrfs_find_device(root->fs_info, srcdevid, NULL, NULL);
+	return *device ? 0 : -ENOENT;
+}
+
+/*
+ * Fill args->result and args->status from the current dev_replace state.
+ *
+ * progress_1000 is 0 for a never started or canceled replace, 1000 for a
+ * finished one, and cursor_left / (srcdev->total_bytes / 1000) while started
+ * or suspended. If the source device is not set, or is smaller than 1000
+ * bytes so the divisor would be 0, progress is reported as 0 rather than
+ * dereferencing NULL or dividing by zero.
+ */
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+			      struct btrfs_ioctl_dev_replace_args *args)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *srcdev;
+	u64 bytes_per_mille;
+
+	btrfs_dev_replace_lock(dev_replace);
+	/* even if !dev_replace_is_valid, the values are good enough for
+	 * the replace_status ioctl */
+	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+	args->status.replace_state = dev_replace->replace_state;
+	args->status.time_started = dev_replace->time_started;
+	args->status.time_stopped = dev_replace->time_stopped;
+	args->status.num_write_errors =
+		atomic64_read(&dev_replace->num_write_errors);
+	args->status.num_uncorrectable_read_errors =
+		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		args->status.progress_1000 = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+		args->status.progress_1000 = 1000;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		srcdev = dev_replace->srcdev;
+		bytes_per_mille = srcdev ?
+			div64_u64(srcdev->total_bytes, 1000) : 0;
+		args->status.progress_1000 = bytes_per_mille ?
+			div64_u64(dev_replace->cursor_left, bytes_per_mille) :
+			0;
+		break;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+}
+
+/*
+ * ioctl entry point for cancelling a replace: the result code of
+ * __btrfs_dev_replace_cancel() is stored in args->result and 0 is always
+ * returned.
+ */
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+			     struct btrfs_ioctl_dev_replace_args *args)
+{
+	u64 result = __btrfs_dev_replace_cancel(fs_info);
+
+	args->result = result;
+	return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{ /*
+ * Cancel a replace that is in state STARTED or SUSPENDED.
+ * Returns BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED when the state is
+ * NEVER_STARTED, FINISHED or CANCELED (nothing is changed), otherwise
+ * BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR after the state was set to
+ * CANCELED, the scrub was cancelled, a transaction was committed and
+ * the target device was destroyed.
+ * The whole procedure runs under lock_finishing_cancel_unmount.
+ */
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ u64 result;
+ int ret;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ btrfs_dev_replace_unlock(dev_replace);
+ goto leave;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev; /* kept so it can be destroyed after the commit */
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ break;
+ }
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans); /* NOTE(review): a negative errno is returned through the u64 result, and tgt_device is not destroyed although dev_replace->tgtdev was already cleared */
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+/*
+ * Move a replace in state STARTED to SUSPENDED, stamp time_stopped and flag
+ * the item for writeback. Every other state is left untouched. Runs under
+ * lock_finishing_cancel_unmount and the dev_replace lock.
+ */
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *repl = &fs_info->dev_replace;
+
+	mutex_lock(&repl->lock_finishing_cancel_unmount);
+	btrfs_dev_replace_lock(repl);
+
+	if (repl->replace_state == BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+		repl->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		repl->time_stopped = btrfs_get_seconds_since_1970();
+		repl->item_needs_writeback = 1;
+		pr_info("btrfs: suspending dev_replace for unmount\n");
+	}
+
+	btrfs_dev_replace_unlock(repl);
+	mutex_unlock(&repl->lock_finishing_cancel_unmount);
+}
+
+/*
+ * resume dev_replace procedure that was interrupted by unmount
+ *
+ * Nothing is done for a replace that was never started, is finished or was
+ * canceled. Otherwise the state is set to STARTED and a "btrfs-devrepl"
+ * kthread is launched to continue the copy.
+ *
+ * If the target device (or its bdev) is missing, the copy cannot continue:
+ * the state is put back to SUSPENDED so that it does not claim a running
+ * copy, and 0 is returned.
+ *
+ * Returns 0, or the error from kthread_run().
+ */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+	btrfs_dev_replace_lock(dev_replace);
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+		break;
+	}
+	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
+			"btrfs: you may cancel the operation after 'mount -o degraded'\n");
+		/* no copy will run, so do not leave the state at STARTED */
+		dev_replace->replace_state =
+			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		btrfs_dev_replace_unlock(dev_replace);
+		return 0;
+	}
+	btrfs_dev_replace_unlock(dev_replace);
+
+	/* claim the exclusive-operation flag; the kthread clears it */
+	WARN_ON(atomic_xchg(
+		&fs_info->mutually_exclusive_operation_running, 1));
+	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+	return PTR_RET(task);
+}
+
+/*
+ * Thread body used to resume a replace: log the current progress (skipped
+ * when the status buffer cannot be allocated), continue the copy and then
+ * clear the mutually_exclusive_operation_running flag. Always returns 0.
+ */
+static int btrfs_dev_replace_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_ioctl_dev_replace_args *args;
+	u64 percent;
+
+	args = kzalloc(sizeof(*args), GFP_NOFS);
+	if (!args)
+		goto resume;
+
+	btrfs_dev_replace_status(fs_info, args);
+	percent = args->status.progress_1000;
+	kfree(args);
+	/* progress_1000 is in units of 1/1000; convert to percent */
+	do_div(percent, 10);
+	printk_in_rcu(KERN_INFO
+		      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+		      dev_replace->srcdev->missing ? "<missing disk>" :
+			rcu_str_deref(dev_replace->srcdev->name),
+		      dev_replace->srcdev->devid,
+		      dev_replace->tgtdev ?
+			rcu_str_deref(dev_replace->tgtdev->name) :
+			"<missing target disk>",
+		      (unsigned int)percent);
+
+resume:
+	btrfs_dev_replace_continue_on_mount(fs_info);
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+	return 0;
+}
+
+/*
+ * Continue the copy from committed_cursor_left up to the source device's
+ * total_bytes, then hand the scrub result to btrfs_dev_replace_finishing().
+ * Always returns 0; a non-zero finishing result only triggers a warning.
+ */
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_dev_replace *repl = &fs_info->dev_replace;
+	struct btrfs_device *src = repl->srcdev;
+	int scrub_ret;
+	int finish_ret;
+
+	scrub_ret = btrfs_scrub_dev(fs_info, src->devid,
+				    repl->committed_cursor_left,
+				    src->total_bytes,
+				    &repl->scrub_progress, 0, 1);
+	finish_ret = btrfs_dev_replace_finishing(fs_info, scrub_ret);
+	WARN_ON(finish_ret);
+	return 0;
+}
+
+/*
+ * Return 0 when the dev_replace information is not valid or the state is
+ * NEVER_STARTED, FINISHED or CANCELED; return 1 otherwise.
+ *
+ * A started or suspended replace counts as ongoing even if the tgtdev is
+ * missing. That can happen when the procedure was suspended by an umount and
+ * the filesystem is then remounted in degraded state without the tgtdev
+ * (or without "btrfs dev scan" having been called). This does not stop the
+ * dev_replace procedure; it has to be canceled manually if that is wanted.
+ */
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+	if (!dev_replace->is_valid)
+		return 0;
+
+	switch (dev_replace->replace_state) {
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+ /*
+ * Take dev_replace->lock, allowing the task that already holds it to
+ * take it again: lock_owner records the holder's pid and
+ * nesting_level counts how often it has locked. Both are updated
+ * while holding lock_management_lock.
+ *
+ * the beginning is just an optimization for the typical case
+ */
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+ /* this is not a nested case where the same thread
+ * is trying to acquire the same lock twice */
+ mutex_lock(&dev_replace->lock);
+ mutex_lock(&dev_replace->lock_management_lock);
+ dev_replace->lock_owner = current->pid;
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_lock(&dev_replace->lock_management_lock); /* the lock looks held: check whether the holder is this task */
+ if (atomic_read(&dev_replace->nesting_level) > 0 &&
+ dev_replace->lock_owner == current->pid) {
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ atomic_inc(&dev_replace->nesting_level); /* nested acquisition: only bump the count */
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_unlock(&dev_replace->lock_management_lock);
+ goto acquire_lock; /* not the owner: take the lock the regular way */
+}
+
+/*
+ * Undo one btrfs_dev_replace_lock() call. The nesting level is decremented
+ * under lock_management_lock; only when it reaches 0 is the owner cleared
+ * and dev_replace->lock released. Warns if the lock is not held, the nesting
+ * level is below 1, or the caller is not the recorded owner.
+ */
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+	WARN_ON(!mutex_is_locked(&dev_replace->lock));
+
+	mutex_lock(&dev_replace->lock_management_lock);
+	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+	WARN_ON(dev_replace->lock_owner != current->pid);
+	atomic_dec(&dev_replace->nesting_level);
+
+	if (atomic_read(&dev_replace->nesting_level) != 0) {
+		/* still held by an outer btrfs_dev_replace_lock() call */
+		mutex_unlock(&dev_replace->lock_management_lock);
+		return;
+	}
+
+	dev_replace->lock_owner = 0;
+	mutex_unlock(&dev_replace->lock_management_lock);
+	mutex_unlock(&dev_replace->lock);
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 00000000000..20035cbbf02
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__) /* include guard */
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args; /* forward declaration; only used through pointers below */
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); /* may be nested by the owning task */
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{ /* bump one of the 64-bit replace counters via atomic64_inc() */
+ atomic64_inc(stat_value);
+}
+#endif /* __BTRFS_DEV_REPLACE__ */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696..502c2158167 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+/*
+ * Check whether a new directory entry @name can be added to directory @dir
+ * without colliding with an existing entry.
+ *
+ * Returns 0 when no item exists for the name's hash or the existing item has
+ * room for one more name, -EEXIST when exactly this name is already present,
+ * -EOVERFLOW when the existing item cannot grow by another entry, -ENOMEM
+ * when no path can be allocated, or the error from the tree search.
+ */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_dir_item *di;
+	int data_size;
+	struct extent_buffer *leaf;
+	int slot;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = dir;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	key.offset = btrfs_name_hash(name, name_len);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+	/* return back any errors */
+	if (ret < 0)
+		goto out;
+
+	/* nothing found, we're safe */
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/* we found an item, look for our name in the item */
+	di = btrfs_match_dir_item_name(root, path, name, name_len);
+	if (di) {
+		/* our exact name was found */
+		ret = -EEXIST;
+		goto out;
+	}
+
+	/*
+	 * see if there is room in the item to insert this
+	 * name
+	 *
+	 * The item header is accounted for exactly once, in the comparison
+	 * below; data_size covers only the new dir entry itself.
+	 */
+	data_size = sizeof(*di) + name_len;
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	if (data_size + btrfs_item_size_nr(leaf, slot) +
+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+		ret = -EOVERFLOW;
+	} else {
+		/* plenty of insertion room */
+		ret = 0;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
/*
* lookup a directory item based on index. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2936ca49b3b..a8f652dc940 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,11 @@
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "dev-replace.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -217,26 +222,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
- u64 failed_start = em->start;
- u64 failed_len = em->len;
-
free_extent_map(em);
em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- ret = 0;
- } else {
- em = lookup_extent_mapping(em_tree, failed_start,
- failed_len);
- ret = -EIO;
- }
+ if (!em)
+ em = ERR_PTR(-EIO);
} else if (ret) {
free_extent_map(em);
- em = NULL;
+ em = ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- if (ret)
- em = ERR_PTR(ret);
out:
return em;
}
@@ -377,9 +372,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE,
btree_get_extent, mirror_num);
- if (!ret && !verify_parent_transid(io_tree, eb,
+ if (!ret) {
+ if (!verify_parent_transid(io_tree, eb,
parent_transid, 0))
- break;
+ break;
+ else
+ ret = -EIO;
+ }
/*
* This buffer's crc is fine, but its contents are corrupted, so
@@ -389,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
break;
- num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ num_copies = btrfs_num_copies(root->fs_info,
eb->start, eb->len);
if (num_copies == 1)
break;
@@ -407,7 +406,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
break;
}
- if (failed && !ret)
+ if (failed && !ret && failed_mirror)
repair_eb_io_failure(root, eb, failed_mirror);
return ret;
@@ -435,10 +434,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(1);
return 0;
}
- if (eb->pages[0] != page) {
- WARN_ON(1);
- return 0;
- }
if (!PageUptodate(page)) {
WARN_ON(1);
return 0;
@@ -754,9 +749,7 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- atomic_dec(&fs_info->nr_async_submits);
-
- if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -860,21 +853,37 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int ret;
+
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+/*
+ * Decide whether a btree write is submitted asynchronously: returns 0 for
+ * bios flagged EXTENT_BIO_TREE_LOG and, on x86, whenever cpu_has_xmm4_2 is
+ * set; returns 1 otherwise.
+ */
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	int async = !(bio_flags & EXTENT_BIO_TREE_LOG);
+
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		async = 0;
+#endif
+	return async;
+}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ int async = check_async_write(inode, bio_flags);
int ret;
if (!(rw & REQ_WRITE)) {
-
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -882,20 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
bio, 1);
if (ret)
- return ret;
- return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
- mirror_num, 0);
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else if (!async) {
+ ret = btree_csum_one_bio(bio);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else {
+ /*
+ * kthread helpers are used to submit writes so that
+ * checksumming can happen in parallel across all CPUs
+ */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ bio_offset,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
}
- /*
- * kthread helpers are used to submit writes so that checksumming
- * can happen in parallel across all CPUs
- */
- return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, rw, bio, mirror_num, 0,
- bio_offset,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ if (ret) {
+out_w_error:
+ bio_endio(bio, ret);
+ }
+ return ret;
}
#ifdef CONFIG_MIGRATION
@@ -980,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
static int btree_set_page_dirty(struct page *page)
{
+#ifdef DEBUG
struct extent_buffer *eb;
BUG_ON(!PagePrivate(page));
@@ -988,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
+#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1114,16 +1137,16 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
spin_unlock(&root->fs_info->delalloc_lock);
btrfs_panic(root->fs_info, -EOVERFLOW,
"Can't clear %lu bytes from "
- " dirty_mdatadata_bytes (%lu)",
+ " dirty_mdatadata_bytes (%llu)",
buf->len,
root->fs_info->dirty_metadata_bytes);
}
spin_unlock(&root->fs_info->delalloc_lock);
- }
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
- clear_extent_buffer_dirty(buf);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
+ }
}
}
@@ -1166,8 +1189,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
@@ -1182,6 +1205,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
+
+ spin_lock_init(&root->root_item_lock);
}
static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -1225,6 +1250,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
return root;
}
+/*
+ * Create a new tree with the given objectid: allocate a root and a single
+ * empty leaf, initialise the leaf header and the root item, and insert the
+ * root item into the tree root.
+ *
+ * Returns the new root, or an ERR_PTR() on failure. On failure everything
+ * allocated here is released again: the leaf is unlocked and its references
+ * (root->node and root->commit_root) are dropped, and the root is freed.
+ */
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	int ret;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, objectid);
+	root->root_key.objectid = objectid;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, objectid, NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		/* nothing to unlock or release for the leaf below */
+		leaf = NULL;
+		goto fail;
+	}
+
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, objectid);
+	root->node = leaf;
+
+	write_extent_buffer(leaf, fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	root->commit_root = btrfs_root_node(root);
+	root->track_dirty = 1;
+
+	root->root_item.flags = 0;
+	root->root_item.byte_limit = 0;
+	btrfs_set_root_bytenr(&root->root_item, leaf->start);
+	btrfs_set_root_generation(&root->root_item, trans->transid);
+	btrfs_set_root_level(&root->root_item, 0);
+	btrfs_set_root_refs(&root->root_item, 1);
+	btrfs_set_root_used(&root->root_item, leaf->len);
+	btrfs_set_root_last_snapshot(&root->root_item, 0);
+	btrfs_set_root_dirid(&root->root_item, 0);
+	root->root_item.drop_level = 0;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+	if (ret)
+		goto fail;
+
+	btrfs_tree_unlock(leaf);
+
+	return root;
+
+fail:
+	if (leaf) {
+		/*
+		 * A non-NULL leaf is only reached after commit_root was set,
+		 * so both references taken above are dropped here.
+		 */
+		btrfs_tree_unlock(leaf);
+		free_extent_buffer(root->commit_root);
+		free_extent_buffer(leaf);
+	}
+	kfree(root);
+
+	return ERR_PTR(ret);
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1326,6 +1427,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u64 generation;
u32 blocksize;
int ret = 0;
+ int slot;
root = btrfs_alloc_root(fs_info);
if (!root)
@@ -1352,9 +1454,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret == 0) {
l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
+ slot = path->slots[0];
+ btrfs_read_root_item(tree_root, l, slot, &root->root_item);
memcpy(&root->root_key, location, sizeof(*location));
}
btrfs_free_path(path);
@@ -1396,6 +1497,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
+ if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return fs_info->quota_root ? fs_info->quota_root :
+ ERR_PTR(-ENOENT);
again:
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
@@ -1533,8 +1637,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
-
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
mutex_trylock(&root->fs_info->cleaner_mutex)) {
btrfs_run_delayed_iputs(root);
@@ -1566,7 +1668,6 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
delay = HZ * 30;
- vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
@@ -1587,9 +1688,10 @@ static int transaction_kthread(void *arg)
spin_unlock(&root->fs_info->trans_lock);
/* If the file system is aborted, this will always fail. */
- trans = btrfs_join_transaction(root);
+ trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
- cannot_commit = true;
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
goto sleep;
}
if (transid == trans->transid) {
@@ -1823,6 +1925,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_extent_buffer(info->extent_root->commit_root);
free_extent_buffer(info->csum_root->node);
free_extent_buffer(info->csum_root->commit_root);
+ if (info->quota_root) {
+ free_extent_buffer(info->quota_root->node);
+ free_extent_buffer(info->quota_root->commit_root);
+ }
info->tree_root->node = NULL;
info->tree_root->commit_root = NULL;
@@ -1832,6 +1938,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
info->extent_root->commit_root = NULL;
info->csum_root->node = NULL;
info->csum_root->commit_root = NULL;
+ if (info->quota_root) {
+ info->quota_root->node = NULL;
+ info->quota_root->commit_root = NULL;
+ }
if (chunk_root) {
free_extent_buffer(info->chunk_root->node);
@@ -1862,6 +1972,7 @@ int open_ctree(struct super_block *sb,
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
+ struct btrfs_root *quota_root;
struct btrfs_root *log_tree_root;
int ret;
int err = -EINVAL;
@@ -1873,9 +1984,10 @@ int open_ctree(struct super_block *sb,
csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
+ quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
if (!tree_root || !extent_root || !csum_root ||
- !chunk_root || !dev_root) {
+ !chunk_root || !dev_root || !quota_root) {
err = -ENOMEM;
goto fail;
}
@@ -1904,13 +2016,11 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
- INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -1924,12 +2034,15 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
- btrfs_init_block_rsv(&fs_info->global_block_rsv);
- btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
- btrfs_init_block_rsv(&fs_info->trans_block_rsv);
- btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
- btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -2031,6 +2144,18 @@ int open_ctree(struct super_block *sb,
init_rwsem(&fs_info->extent_commit_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+
+ spin_lock_init(&fs_info->qgroup_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2172,6 +2297,10 @@ int open_ctree(struct super_block *sb,
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
+
btrfs_init_workers(&fs_info->submit_workers, "submit",
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
@@ -2243,8 +2372,9 @@ int open_ctree(struct super_block *sb,
ret |= btrfs_start_workers(&fs_info->delayed_workers);
ret |= btrfs_start_workers(&fs_info->caching_workers);
ret |= btrfs_start_workers(&fs_info->readahead_workers);
+ ret |= btrfs_start_workers(&fs_info->flush_workers);
if (ret) {
- ret = -ENOMEM;
+ err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2311,7 +2441,11 @@ int open_ctree(struct super_block *sb,
goto fail_tree_roots;
}
- btrfs_close_extra_devices(fs_devices);
+ /*
+ * keep the device that is marked to be the target device for the
+ * dev_replace procedure
+ */
+ btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2356,6 +2490,17 @@ retry_root_backup:
goto recovery_tree_root;
csum_root->track_dirty = 1;
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_QUOTA_TREE_OBJECTID, quota_root);
+ if (ret) {
+ kfree(quota_root);
+ quota_root = fs_info->quota_root = NULL;
+ } else {
+ quota_root->track_dirty = 1;
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ }
+
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
@@ -2372,6 +2517,14 @@ retry_root_backup:
goto fail_block_groups;
}
+ ret = btrfs_init_dev_replace(fs_info);
+ if (ret) {
+ pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
ret = btrfs_init_space_info(fs_info);
if (ret) {
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2383,6 +2536,15 @@ retry_root_backup:
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(sb->s_flags & MS_RDONLY)) {
+ printk(KERN_WARNING
+ "Btrfs: too many missing devices, writeable mount is not allowed\n");
+ goto fail_block_groups;
+ }
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
@@ -2415,17 +2577,19 @@ retry_root_backup:
" integrity check module %s\n", sb->s_id);
}
#endif
+ ret = btrfs_read_qgroup_config(fs_info);
+ if (ret)
+ goto fail_trans_kthread;
/* do not make disk changes in broken FS */
- if (btrfs_super_log_root(disk_super) != 0 &&
- !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+ if (btrfs_super_log_root(disk_super) != 0) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
printk(KERN_WARNING "Btrfs log replay required "
"on RO media\n");
err = -EIO;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
blocksize =
btrfs_level_size(tree_root,
@@ -2434,7 +2598,7 @@ retry_root_backup:
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
__setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2466,15 +2630,15 @@ retry_root_backup:
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret) {
- }
+ if (ret)
+ goto fail_trans_kthread;
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
printk(KERN_WARNING
"btrfs: failed to recover relocation\n");
err = -EINVAL;
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
@@ -2484,10 +2648,10 @@ retry_root_backup:
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (!fs_info->fs_root)
- goto fail_trans_kthread;
+ goto fail_qgroup;
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
if (sb->s_flags & MS_RDONLY)
@@ -2509,8 +2673,17 @@ retry_root_backup:
return ret;
}
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("btrfs: failed to resume dev_replace\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
return 0;
+fail_qgroup:
+ btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
fail_cleaner:
@@ -2543,6 +2716,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2762,12 +2936,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
printk_in_rcu("btrfs: disabling barriers on dev %s\n",
rcu_str_deref(device->name));
device->nobarriers = 1;
- }
- if (!bio_flagged(bio, BIO_UPTODATE)) {
+ } else if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
- if (!bio_flagged(bio, BIO_EOPNOTSUPP))
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2806,14 +2978,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
- int errors = 0;
+ int errors_send = 0;
+ int errors_wait = 0;
int ret;
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_send++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2821,13 +2994,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 0);
if (ret)
- errors++;
+ errors_send++;
}
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
- errors++;
+ errors_wait++;
continue;
}
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2835,13 +3008,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 1);
if (ret)
- errors++;
+ errors_wait++;
}
- if (errors)
+ if (errors_send > info->num_tolerated_disk_barrier_failures ||
+ errors_wait > info->num_tolerated_disk_barrier_failures)
return -EIO;
return 0;
}
+/*
+ * Compute how many devices may fail a barrier (flush) request before the
+ * caller has to treat the barrier as failed.
+ *
+ * The result starts at the number of devices in fs_info->fs_devices and is
+ * lowered while walking the space infos for data, system, metadata and
+ * mixed data+metadata: for every RAID index that has block groups with
+ * non-zero total and used bytes, the profile flags reported by
+ * btrfs_get_block_group_info() cap the result at 0 or 1 (see below).
+ *
+ * Returns the resulting tolerance as an int.
+ */
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_space_info *sinfo;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ /* must match the number of entries in types[] above */
+ int num_types = 4;
+ int i;
+ int c;
+ int num_tolerated_disk_barrier_failures =
+ (int)fs_info->fs_devices->num_devices;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ /* look up the space info whose flags equal types[i] exactly */
+ sinfo = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ sinfo = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /* no space info of this type: nothing to account for */
+ if (!sinfo)
+ continue;
+
+ down_read(&sinfo->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&sinfo->block_groups[c])) {
+ u64 flags;
+
+ btrfs_get_block_group_info(
+ &sinfo->block_groups[c], &space);
+ /* skip profiles that hold no space or no data */
+ if (space.total_bytes == 0 ||
+ space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+ /*
+ * return
+ * 0: if dup, single or RAID0 is configured for
+ * any of metadata, system or data, else
+ * 1: if RAID5 is configured, or if RAID1 or
+ * RAID10 is configured and only two mirrors
+ * are used, else
+ * 2: if RAID6 is configured, else
+ * num_mirrors - 1: if RAID1 or RAID10 is
+ * configured and more than
+ * 2 mirrors are used.
+ *
+ * NOTE(review): the checks below only test the
+ * DUP, RAID0, "no profile bit" (single), RAID1
+ * and RAID10 cases, i.e. only the results 0 and
+ * 1 are produced here; the RAID5/RAID6 and
+ * num_mirrors - 1 cases are not implemented in
+ * this function as shown.
+ */
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+ == 0)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1
+ &&
+ (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+
+ return num_tolerated_disk_barrier_failures;
+}
+
int write_all_supers(struct btrfs_root *root, int max_mirrors)
{
struct list_head *head;
@@ -2864,8 +3111,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- if (do_barriers)
- barrier_all_devices(root->fs_info);
+ if (do_barriers) {
+ ret = barrier_all_devices(root->fs_info);
+ if (ret) {
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ btrfs_error(root->fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
@@ -3065,41 +3320,27 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
/* pause restriper - we want to resume on mount */
- btrfs_pause_balance(root->fs_info);
+ btrfs_pause_balance(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(root);
+ btrfs_scrub_cancel(fs_info);
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(fs_info);
+ btrfs_cleanup_defrag_inodes(fs_info);
- /*
- * Here come 2 situations when btrfs is broken to flip readonly:
- *
- * 1. when btrfs flips readonly somewhere else before
- * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
- * and btrfs will skip to write sb directly to keep
- * ERROR state on disk.
- *
- * 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannot write sb via btrfs_commit_super,
- * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
- * btrfs will cleanup all FS resources first and write sb then.
- */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- ret = btrfs_error_commit_super(root);
- if (ret)
- printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
- }
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ btrfs_error_commit_super(root);
btrfs_put_block_group_cache(fs_info);
@@ -3109,14 +3350,12 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 2;
smp_mb();
+ btrfs_free_qgroup_config(root->fs_info);
+
if (fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
(unsigned long long)fs_info->delalloc_bytes);
}
- if (fs_info->total_ref_cache_size) {
- printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
- (unsigned long long)fs_info->total_ref_cache_size);
- }
free_extent_buffer(fs_info->extent_root->node);
free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3128,6 +3367,10 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->dev_root->commit_root);
free_extent_buffer(fs_info->csum_root->node);
free_extent_buffer(fs_info->csum_root->commit_root);
+ if (fs_info->quota_root) {
+ free_extent_buffer(fs_info->quota_root->node);
+ free_extent_buffer(fs_info->quota_root->commit_root);
+ }
btrfs_free_block_groups(fs_info);
@@ -3148,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+ btrfs_stop_workers(&fs_info->flush_workers);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3192,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
int was_dirty;
btrfs_assert_tree_locked(buf);
- if (transid != root->fs_info->generation) {
- printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ if (transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
"found %llu running %llu\n",
(unsigned long long)buf->start,
(unsigned long long)transid,
(unsigned long long)root->fs_info->generation);
- WARN_ON(1);
- }
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
@@ -3208,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
}
}
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+ int flush_delayed)
{
/*
* looks as though older kernels can get into trouble with
@@ -3220,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
if (current->flags & PF_MEMALLOC)
return;
- btrfs_balance_delayed_items(root);
+ if (flush_delayed)
+ btrfs_balance_delayed_items(root);
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
}
return;
}
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
- /*
- * looks as though older kernels can get into trouble with
- * this code, they end up stuck in balance_dirty_pages forever
- */
- u64 num_dirty;
- unsigned long thresh = 32 * 1024 * 1024;
-
- if (current->flags & PF_MEMALLOC)
- return;
-
- num_dirty = root->fs_info->dirty_metadata_bytes;
+ __btrfs_btree_balance_dirty(root, 1);
+}
- if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
- }
- return;
+/*
+ * Variant of btrfs_btree_balance_dirty() that passes flush_delayed == 0 to
+ * __btrfs_btree_balance_dirty(), so the btrfs_balance_delayed_items() call
+ * is skipped and only the dirty-metadata threshold check is performed.
+ */
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+ __btrfs_btree_balance_dirty(root, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
@@ -3258,52 +3491,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
}
-static int btree_lock_page_hook(struct page *page, void *data,
- void (*flush_fn)(void *))
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_buffer *eb;
-
- /*
- * We culled this eb but the page is still hanging out on the mapping,
- * carry on.
- */
- if (!PagePrivate(page))
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- if (!eb) {
- WARN_ON(1);
- goto out;
- }
- if (page != eb->pages[0])
- goto out;
-
- if (!btrfs_try_tree_write_lock(eb)) {
- flush_fn(data);
- btrfs_tree_lock(eb);
- }
- btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
- if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
- spin_lock(&root->fs_info->delalloc_lock);
- if (root->fs_info->dirty_metadata_bytes >= eb->len)
- root->fs_info->dirty_metadata_bytes -= eb->len;
- else
- WARN_ON(1);
- spin_unlock(&root->fs_info->delalloc_lock);
- }
-
- btrfs_tree_unlock(eb);
-out:
- if (!trylock_page(page)) {
- flush_fn(data);
- lock_page(page);
- }
- return 0;
-}
-
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
@@ -3315,18 +3502,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
if (read_only)
return 0;
- if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- printk(KERN_WARNING "warning: mount fs with errors, "
- "running btrfsck is recommended\n");
- }
-
return 0;
}
-int btrfs_error_commit_super(struct btrfs_root *root)
+void btrfs_error_commit_super(struct btrfs_root *root)
{
- int ret;
-
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3336,10 +3516,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(root);
-
- ret = write_ctree_super(NULL, root, 0);
-
- return ret;
}
static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3517,7 +3693,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark);
+ mark, NULL);
if (ret)
break;
@@ -3572,7 +3748,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -3663,14 +3839,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
/* FIXME: cleanup wait for commit */
t->in_commit = 1;
t->blocked = 1;
+ smp_mb();
if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
wake_up(&root->fs_info->transaction_blocked_wait);
t->blocked = 0;
+ smp_mb();
if (waitqueue_active(&root->fs_info->transaction_wait))
wake_up(&root->fs_info->transaction_wait);
t->commit_done = 1;
+ smp_mb();
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
@@ -3706,7 +3885,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
}
static struct extent_io_ops btree_extent_io_ops = {
- .write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b3fab39f7..305c33efb0e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
+void btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -89,6 +89,14 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
int btrfs_cleanup_transaction(struct btrfs_root *root);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_root *root);
+void btrfs_abort_devices(struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6e1d36702ff..5a3327b8f90 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,9 @@
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "math.h"
+
+#undef SCRAMBLE_DELAYED_REFS
/*
* control flags for do_chunk_alloc's force field
@@ -92,8 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force);
+ struct btrfs_root *extent_root, u64 flags,
+ int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -310,7 +313,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
while (start < end) {
ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE);
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
if (ret)
break;
@@ -646,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
u64 btrfs_find_block_group(struct btrfs_root *root,
u64 search_start, u64 search_hint, int owner)
{
@@ -1832,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
@@ -2217,6 +2203,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int count = 0;
int must_insert_reserved = 0;
@@ -2249,13 +2236,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ */
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
+
+ /*
* locked_ref is the head node, so we have to go one
* node back for any delayed ref updates
*/
ref = select_delayed_ref(locked_ref);
if (ref && ref->seq &&
- btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+ btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
/*
* there are still refs with lower seq numbers in the
* process of being added. Don't run this ref yet.
@@ -2300,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
kfree(extent_op);
if (ret) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+
printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
@@ -2315,12 +2315,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- /*
- * we modified num_entries, but as we're currently running
- * delayed refs, skip
- * wake_up(&delayed_refs->seq_wait);
- * here.
- */
+ if (locked_ref) {
+ /*
+ * when we play the delayed ref, also correct the
+ * ref_mod on head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->node.ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->node.ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ }
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2331,35 +2342,97 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
count++;
if (ret) {
+ if (locked_ref) {
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ }
printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
next:
- do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024,
- btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
return count;
}
-static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
- unsigned long num_refs,
- struct list_head *first_seq)
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Debug helper, only built when SCRAMBLE_DELAYED_REFS is defined.
+ *
+ * Normally delayed refs get processed in ascending bytenr order.  This
+ * correlates in most cases to the order added.  To expose dependencies on
+ * this order, we start to process the tree in the middle instead of the
+ * beginning.
+ *
+ * The walk starts at the rb-tree root and descends alternately left and
+ * right until it falls off the tree; the bytenr of the last node visited
+ * is returned.  For an empty tree the loop never runs, so 0 is returned
+ * rather than an uninitialized value.
+ */
+static u64 find_middle(struct rb_root *root)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int alt = 1;
+	u64 middle = 0;	/* defined result for an empty tree */
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		middle = entry->bytenr;
+
+		/* zig-zag: left on odd steps, right on even steps */
+		if (alt)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+
+		alt = 1 - alt;
+	}
+	return middle;
+}
+#endif
+
+/*
+ * Run the qgroup accounting updates queued on a transaction handle.
+ *
+ * @trans:   handle whose qgroup_ref_list and delayed_ref_elem are processed
+ * @fs_info: passed through to btrfs_qgroup_account_ref() and
+ *           btrfs_put_tree_mod_seq()
+ *
+ * Returns 0 if there is nothing to do (delayed_ref_elem.seq is zero) or if
+ * every update succeeded, otherwise the first non-zero value returned by
+ * btrfs_qgroup_account_ref().  A list/seq mismatch is treated as fatal
+ * (BUG()).
+ */
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- spin_unlock(&delayed_refs->lock);
- pr_debug("waiting for more refs (num %ld, first %p)\n",
- num_refs, first_seq);
- wait_event(delayed_refs->seq_wait,
- num_refs != delayed_refs->num_entries ||
- delayed_refs->seq_head.next != first_seq);
- pr_debug("done waiting for more refs (num %ld, first %p)\n",
- delayed_refs->num_entries, delayed_refs->seq_head.next);
- spin_lock(&delayed_refs->lock);
+ struct qgroup_update *qgroup_update;
+ int ret = 0;
+
+ /* the list must be non-empty exactly when a seq is held */
+ if (list_empty(&trans->qgroup_ref_list) !=
+ !trans->delayed_ref_elem.seq) {
+ /* list without seq or seq without list */
+ printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+ list_empty(&trans->qgroup_ref_list) ? "" : " not",
+ trans->delayed_ref_elem.seq);
+ BUG();
+ }
+
+ if (!trans->delayed_ref_elem.seq)
+ return 0;
+
+ /*
+ * Drain the whole list even after a failure: once ret is non-zero the
+ * remaining entries are only unlinked and freed, not accounted.
+ */
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+ struct qgroup_update, list);
+ list_del(&qgroup_update->list);
+ if (!ret)
+ ret = btrfs_qgroup_account_ref(
+ trans, fs_info, qgroup_update->node,
+ qgroup_update->extent_op);
+ kfree(qgroup_update);
+ }
+
+ /* called on both the success and the error path */
+ btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+ return ret;
}
/*
@@ -2379,13 +2452,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
- struct list_head *first_seq = NULL;
int ret;
u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
- unsigned long num_refs = 0;
- int consider_waiting;
+ int loops;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2394,15 +2465,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
- CHUNK_ALLOC_NO_FORCE);
+ btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
- consider_waiting = 0;
+ loops = 0;
spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+ delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
if (count == 0) {
count = delayed_refs->num_entries * 2;
run_most = 1;
@@ -2424,31 +2498,6 @@ again:
if (ret)
break;
- if (delayed_start >= delayed_refs->run_delayed_start) {
- if (consider_waiting == 0) {
- /*
- * btrfs_find_ref_cluster looped. let's do one
- * more cycle. if we don't run any delayed ref
- * during that cycle (because we can't because
- * all of them are blocked) and if the number of
- * refs doesn't change, we avoid busy waiting.
- */
- consider_waiting = 1;
- num_refs = delayed_refs->num_entries;
- first_seq = root->fs_info->tree_mod_seq_list.next;
- } else {
- wait_for_more_refs(delayed_refs,
- num_refs, first_seq);
- /*
- * after waiting, things have changed. we
- * dropped the lock and someone else might have
- * run some refs, built new clusters and so on.
- * therefore, we restart staleness detection.
- */
- consider_waiting = 0;
- }
- }
-
ret = run_clustered_refs(trans, root, &cluster);
if (ret < 0) {
spin_unlock(&delayed_refs->lock);
@@ -2461,13 +2510,36 @@ again:
if (count == 0)
break;
- if (ret || delayed_refs->run_delayed_start == 0) {
+ if (delayed_start >= delayed_refs->run_delayed_start) {
+ if (loops == 0) {
+ /*
+ * btrfs_find_ref_cluster looped. let's do one
+ * more cycle. if we don't run any delayed ref
+ * during that cycle (because we can't because
+ * all of them are blocked), bail out.
+ */
+ loops = 1;
+ } else {
+ /*
+ * no runnable refs left, stop trying
+ */
+ BUG_ON(run_all);
+ break;
+ }
+ }
+ if (ret) {
/* refs were run, let's reset staleness detection */
- consider_waiting = 0;
+ loops = 0;
}
}
if (run_all) {
+ if (!list_empty(&trans->new_bgs)) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_create_pending_block_groups(trans, root);
+ spin_lock(&delayed_refs->lock);
+ }
+
node = rb_first(&delayed_refs->root);
if (!node)
goto out;
@@ -2502,6 +2574,7 @@ again:
}
out:
spin_unlock(&delayed_refs->lock);
+ assert_qgroups_uptodate(trans);
return 0;
}
@@ -2581,8 +2654,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
node = rb_prev(node);
if (node) {
+ int seq = ref->seq;
+
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (ref->bytenr == bytenr)
+ if (ref->bytenr == bytenr && ref->seq == seq)
goto out_unlock;
}
@@ -2903,25 +2978,29 @@ again:
}
spin_lock(&block_group->lock);
- if (block_group->cached != BTRFS_CACHE_FINISHED) {
- /* We're not cached, don't bother trying to write stuff out */
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(root, SPACE_CACHE)) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option.
+ */
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
goto out_put;
}
spin_unlock(&block_group->lock);
- num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
if (!num_pages)
num_pages = 1;
- /*
- * Just to make absolutely sure we have enough space, we're going to
- * preallocate 12 pages worth of space for each block group. In
- * practice we ought to use at most 8, but we need extra space so we can
- * add our header and have a terminator between the extents and the
- * bitmaps.
- */
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
@@ -3134,6 +3213,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
init_waitqueue_head(&found->wait);
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = found;
return 0;
}
@@ -3263,12 +3344,6 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return get_alloc_profile(root, flags);
}
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- BTRFS_BLOCK_GROUP_DATA);
-}
-
/*
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
@@ -3277,6 +3352,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 used;
int ret = 0, committed = 0, alloc_chunk = 1;
@@ -3289,7 +3365,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
committed = 1;
}
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = fs_info->data_sinfo;
if (!data_sinfo)
goto alloc;
@@ -3319,7 +3395,6 @@ alloc:
return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- bytes + 2 * 1024 * 1024,
alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
@@ -3330,10 +3405,9 @@ alloc:
goto commit_trans;
}
- if (!data_sinfo) {
- btrfs_set_inode_space_info(root, inode);
- data_sinfo = BTRFS_I(inode)->space_info;
- }
+ if (!data_sinfo)
+ data_sinfo = fs_info->data_sinfo;
+
goto again;
}
@@ -3380,7 +3454,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
- data_sinfo = BTRFS_I(inode)->space_info;
+ data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
@@ -3402,8 +3476,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
static int should_alloc_chunk(struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 alloc_bytes,
- int force)
+ struct btrfs_space_info *sinfo, int force)
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3418,7 +3491,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
* and purposes it's used space. Don't worry about locking the
* global_rsv, it doesn't change except when the transaction commits.
*/
- num_allocated += global_rsv->size;
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+ num_allocated += global_rsv->size;
/*
* in limited mode, we want to have some free space up to
@@ -3432,15 +3506,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
-
- /* 256MB or 2% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
- /* system chunks need a much small threshold */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
- thresh = 32 * 1024 * 1024;
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+ if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3490,8 +3557,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 alloc_bytes,
- u64 flags, int force)
+ struct btrfs_root *extent_root, u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3515,7 +3581,7 @@ again:
return 0;
}
- if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+ if (!should_alloc_chunk(extent_root, space_info, force)) {
spin_unlock(&space_info->lock);
return 0;
} else if (space_info->chunk_alloc) {
@@ -3583,92 +3649,125 @@ out:
return ret;
}
+/*
+ * Decide whether a reservation of @bytes may be granted to @space_info even
+ * though it does not fit in the space already allocated to it, by counting a
+ * fraction of the still unallocated chunk space as if it were available.
+ *
+ * @root:       root used to look up the allocation profile and fs_info
+ * @space_info: space info the reservation would be charged to
+ * @bytes:      size of the reservation being considered
+ * @flush:      how much flushing the caller may do; selects the fraction of
+ *              unallocated space that may be counted
+ *
+ * Returns 1 if the reservation fits within total_bytes plus the allowed
+ * fraction of unallocated space, 0 otherwise.
+ *
+ * NOTE(review): the space_info counters are read without taking any lock
+ * here; the callers visible in this patch hold space_info->lock - confirm
+ * that this is required of every caller.
+ */
+static int can_overcommit(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 avail;
+ u64 used;
+
+ /* everything that is already accounted against this space_info */
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+
+ /* snapshot the unallocated chunk space under its own lock */
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually usable.
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2 of the space. If we can flush everything, don't let us
+ * overcommit too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
+
+/*
+ * Start writeback for @sb via writeback_inodes_sb_nr(), but only when no
+ * writeback is already in progress on the superblock's bdi and sb->s_umount
+ * can be read-locked without sleeping (trylock).
+ *
+ * @sb:       superblock to write back
+ * @nr_pages: page count handed to writeback_inodes_sb_nr()
+ * @reason:   writeback reason handed to writeback_inodes_sb_nr()
+ *
+ * Returns 1 if writeback_inodes_sb_nr() was called, 0 if it was skipped.
+ *
+ * NOTE(review): the trylock presumably exists so that this path never blocks
+ * behind a holder of s_umount (e.g. umount/remount) - confirm.
+ */
+static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+ unsigned long nr_pages,
+ enum wb_reason reason)
+{
+ if (!writeback_in_progress(sb->s_bdi) &&
+ down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, reason);
+ up_read(&sb->s_umount);
+ return 1;
+ }
+
+ /* writeback already running or s_umount contended: nothing was started */
+ return 0;
+}
+
/*
* shrink metadata reservation for delalloc
*/
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
- bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+ bool wait_ordered)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
- u64 reserved;
+ u64 delalloc_bytes;
u64 max_reclaim;
- u64 reclaimed = 0;
long time_left;
unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
- unsigned long progress;
+ enum btrfs_reserve_flush_enum flush;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
- reserved = space_info->bytes_may_use;
- progress = space_info->reservation_progress;
-
- if (reserved == 0)
- return 0;
-
- smp_mb();
- if (root->fs_info->delalloc_bytes == 0) {
+ delalloc_bytes = root->fs_info->delalloc_bytes;
+ if (delalloc_bytes == 0) {
if (trans)
- return 0;
- btrfs_wait_ordered_extents(root, 0, 0);
- return 0;
+ return;
+ btrfs_wait_ordered_extents(root, 0);
+ return;
}
- max_reclaim = min(reserved, to_reclaim);
- nr_pages = max_t(unsigned long, nr_pages,
- max_reclaim >> PAGE_CACHE_SHIFT);
- while (loops < 1024) {
- /* have the flusher threads jump in and do some IO */
- smp_mb();
- nr_pages = min_t(unsigned long, nr_pages,
- root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
+ while (delalloc_bytes && loops < 3) {
+ max_reclaim = min(delalloc_bytes, to_reclaim);
+ nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+ nr_pages,
+ WB_REASON_FS_FREE_SPACE);
+
+ /*
+ * We need to wait for the async pages to actually start before
+ * we do anything.
+ */
+ wait_event(root->fs_info->async_submit_wait,
+ !atomic_read(&root->fs_info->async_delalloc_pages));
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_may_use)
- reclaimed += reserved - space_info->bytes_may_use;
- reserved = space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, orig, flush)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
-
- if (reserved == 0 || reclaimed >= max_reclaim)
- break;
-
- if (trans && trans->transaction->blocked)
- return -EAGAIN;
-
if (wait_ordered && !trans) {
- btrfs_wait_ordered_extents(root, 0, 0);
+ btrfs_wait_ordered_extents(root, 0);
} else {
- time_left = schedule_timeout_interruptible(1);
-
- /* We were interrupted, exit */
+ time_left = schedule_timeout_killable(1);
if (time_left)
break;
}
-
- /* we've kicked the IO a few times, if anything has been freed,
- * exit. There is no sense in looping here for a long time
- * when we really need to commit the transaction, or there are
- * just too many writers without enough free space
- */
-
- if (loops > 3) {
- smp_mb();
- if (progress != space_info->reservation_progress)
- break;
- }
-
+ smp_mb();
+ delalloc_bytes = root->fs_info->delalloc_bytes;
}
-
- return reclaimed >= to_reclaim;
}
/**
@@ -3728,12 +3827,78 @@ commit:
return btrfs_commit_transaction(trans, root);
}
+/*
+ * Steps flush_space() can perform to free up metadata space.
+ *
+ * The numeric order matters: reserve_metadata_bytes() starts at
+ * FLUSH_DELAYED_ITEMS_NR and increments its state after every flush_space()
+ * call, so the steps are tried in ascending order. BTRFS_RESERVE_FLUSH_LIMIT
+ * callers jump over the two delalloc states straight to ALLOC_CHUNK.
+ */
+enum flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1, /* run a number of delayed items derived from the bytes needed */
+ FLUSH_DELAYED_ITEMS = 2, /* run delayed items with nr == -1 */
+ FLUSH_DELALLOC = 3, /* shrink_delalloc() with wait_ordered == false */
+ FLUSH_DELALLOC_WAIT = 4, /* shrink_delalloc() with wait_ordered == true */
+ ALLOC_CHUNK = 5, /* try do_chunk_alloc() with CHUNK_ALLOC_NO_FORCE */
+ COMMIT_TRANS = 6, /* may_commit_transaction() */
+};
+
+/*
+ * Perform a single space-reclaim step, selected by @state (one of
+ * enum flush_state).
+ *
+ * @root:       root the reservation is being made for
+ * @space_info: space info we are trying to free space in; only used by the
+ *              COMMIT_TRANS step
+ * @num_bytes:  amount of space the caller would like reclaimed; used to size
+ *              the delayed-item and delalloc steps
+ * @orig_bytes: size of the original reservation; passed to shrink_delalloc()
+ *              and may_commit_transaction()
+ * @state:      which step to run
+ *
+ * Returns 0 on success or a negative errno. An unknown @state yields -ENOSPC.
+ */
+static int flush_space(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ u64 orig_bytes, int state)
+{
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR) {
+ u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ /*
+ * Number of items whose metadata size covers num_bytes
+ * (at least one), then doubled.
+ */
+ nr = (int)div64_u64(num_bytes, bytes);
+ if (!nr)
+ nr = 1;
+ nr *= 2;
+ } else {
+ /* NOTE(review): -1 presumably means "run all delayed items" - confirm */
+ nr = -1;
+ }
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, root, nr);
+ btrfs_end_transaction(trans, root);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ /* shrink_delalloc() returns nothing, so ret stays 0 for these steps */
+ shrink_delalloc(root, num_bytes, orig_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case ALLOC_CHUNK:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ /*
+ * Failing to allocate a chunk is not reported as an error here;
+ * presumably so the caller just moves on to its next state.
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ case COMMIT_TRANS:
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+ break;
+ default:
+ /* ran past the last known state: nothing left to try */
+ ret = -ENOSPC;
+ break;
+ }
+
+ return ret;
+}
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
* @block_rsv - the block_rsv we're allocating for
* @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -3744,25 +3909,25 @@ commit:
*/
static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes, int flush)
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 used;
u64 num_bytes = orig_bytes;
- int retries = 0;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool committed = false;
bool flushing = false;
- bool wait_ordered = false;
again:
ret = 0;
spin_lock(&space_info->lock);
/*
- * We only want to wait if somebody other than us is flushing and we are
- * actually alloed to flush.
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
*/
- while (flush && !flushing && space_info->flush) {
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
spin_unlock(&space_info->lock);
/*
* If we have a trans handle we can't wait because the flusher
@@ -3812,111 +3977,57 @@ again:
* amount plus the amount of bytes that we need for this
* reservation.
*/
- wait_ordered = true;
num_bytes = used - space_info->total_bytes +
- (orig_bytes * (retries + 1));
+ (orig_bytes * 2);
}
- if (ret) {
- u64 profile = btrfs_get_alloc_profile(root, 0);
- u64 avail;
-
- /*
- * If we have a lot of space that's pinned, don't bother doing
- * the overcommit dance yet and just commit the transaction.
- */
- avail = (space_info->total_bytes - space_info->bytes_used) * 8;
- do_div(avail, 10);
- if (space_info->bytes_pinned >= avail && flush && !committed) {
- space_info->flush = 1;
- flushing = true;
- spin_unlock(&space_info->lock);
- ret = may_commit_transaction(root, space_info,
- orig_bytes, 1);
- if (ret)
- goto out;
- committed = true;
- goto again;
- }
-
- spin_lock(&root->fs_info->free_chunk_lock);
- avail = root->fs_info->free_chunk_space;
-
- /*
- * If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
- */
- if (profile & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))
- avail >>= 1;
-
- /*
- * If we aren't flushing don't let us overcommit too much, say
- * 1/8th of the space. If we can flush, let it overcommit up to
- * 1/2 of the space.
- */
- if (flush)
- avail >>= 3;
- else
- avail >>= 1;
- spin_unlock(&root->fs_info->free_chunk_lock);
-
- if (used + num_bytes < space_info->total_bytes + avail) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- } else {
- wait_ordered = true;
- }
+ if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
}
/*
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
*/
- if (ret && flush) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
spin_unlock(&space_info->lock);
- if (!ret || !flush)
- goto out;
-
- /*
- * We do synchronous shrinking since we don't actually unreserve
- * metadata until after the IO is completed.
- */
- ret = shrink_delalloc(root, num_bytes, wait_ordered);
- if (ret < 0)
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
goto out;
- ret = 0;
+ ret = flush_space(root, space_info, num_bytes, orig_bytes,
+ flush_state);
+ flush_state++;
/*
- * So if we were overcommitted it's possible that somebody else flushed
- * out enough space and we simply didn't have enough space to reclaim,
- * so go back around and try again.
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
*/
- if (retries < 2) {
- wait_ordered = true;
- retries++;
- goto again;
- }
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
- ret = -ENOSPC;
- if (committed)
- goto out;
-
- ret = may_commit_transaction(root, space_info, orig_bytes, 0);
- if (!ret) {
- committed = true;
+ if (!ret)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
goto again;
- }
out:
if (flushing) {
@@ -3934,7 +4045,10 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (root->ref_cows || root == root->fs_info->csum_root)
+ if (root->ref_cows)
+ block_rsv = trans->block_rsv;
+
+ if (root == root->fs_info->csum_root && trans->adding_csums)
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -4031,13 +4145,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
+ rsv->type = type;
}
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4046,7 +4162,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
if (!block_rsv)
return NULL;
- btrfs_init_block_rsv(block_rsv);
+ btrfs_init_block_rsv(block_rsv, type);
block_rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
return block_rsv;
@@ -4055,13 +4171,15 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
+ if (!rsv)
+ return;
btrfs_block_rsv_release(root, rsv, (u64)-1);
kfree(rsv);
}
-static inline int __block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -4077,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor)
{
@@ -4109,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -4139,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4286,6 +4376,9 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ if (!trans->block_rsv)
+ return;
+
if (!trans->bytes_reserved)
return;
@@ -4330,10 +4423,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
/*
- * two for root back/forward refs, two for directory entries
- * and one for root of the snapshot.
+ * two for root back/forward refs, two for directory entries,
+ * one for root of the snapshot and one for parent inode.
*/
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
dst_rsv->space_info = src_rsv->space_info;
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -4440,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 csum_bytes;
unsigned nr_extents = 0;
int extra_reserve = 0;
- int flush = 1;
- int ret;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+ bool delalloc_lock = true;
- /* Need to be holding the i_mutex here if we aren't free space cache */
- if (btrfs_is_free_space_inode(root, inode))
- flush = 0;
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
- if (flush && btrfs_transaction_in_commit(root->fs_info))
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
- mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4476,7 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (root->fs_info->quota_enabled)
+ ret = btrfs_qgroup_reserve(root, num_bytes +
+ nr_extents * root->leafsize);
+
+ /*
+ * ret != 0 here means the qgroup reservation failed, we go straight to
+ * the shared error handling then.
+ */
+ if (ret == 0)
+ ret = reserve_metadata_bytes(root, block_rsv,
+ to_reserve, flush);
+
if (ret) {
u64 to_free = 0;
unsigned dropped;
@@ -4506,7 +4620,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
btrfs_ino(inode),
to_free, 0);
}
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ nr_extents * root->leafsize);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
@@ -4518,7 +4637,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4554,6 +4675,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
+ if (root->fs_info->quota_enabled) {
+ btrfs_qgroup_free(root, num_bytes +
+ dropped * root->leafsize);
+ }
+
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
@@ -4863,9 +4989,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 len;
+ bool readonly;
while (start <= end) {
+ readonly = false;
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
@@ -4883,15 +5013,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
}
start += len;
+ space_info = cache->space_info;
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- if (cache->ro)
- cache->space_info->bytes_readonly += len;
+ space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ }
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
}
if (cache)
@@ -4918,7 +5063,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY, NULL);
if (ret)
break;
@@ -4996,8 +5141,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5015,8 +5162,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root,
path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
extent_slot = path->slots[0];
}
} else if (ret == -ENOENT) {
@@ -5030,7 +5179,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)owner_objectid,
(unsigned long long)owner_offset);
} else {
- goto abort;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
}
leaf = path->nodes[0];
@@ -5040,8 +5190,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(found_extent || extent_slot != path->slots[0]);
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
path->leave_spinning = 1;
@@ -5058,8 +5210,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(unsigned long long)bytenr);
btrfs_print_leaf(extent_root, path->nodes[0]);
}
- if (ret < 0)
- goto abort;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
extent_slot = path->slots[0];
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5096,8 +5251,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
is_data);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
} else {
if (found_extent) {
@@ -5114,27 +5271,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
- if (ret)
- goto abort;
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
}
out:
btrfs_free_path(path);
return ret;
-
-abort:
- btrfs_abort_transaction(trans, extent_root, ret);
- goto out;
}
/*
@@ -5190,8 +5349,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
rb_erase(&head->node.rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (waitqueue_active(&delayed_refs->seq_wait))
- wake_up(&delayed_refs->seq_wait);
/*
* we don't take a ref on the node because we're removing it from the
@@ -5348,7 +5505,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
{
int index;
@@ -5368,7 +5525,7 @@ static int __get_block_group_index(u64 flags)
static int get_block_group_index(struct btrfs_block_group_cache *cache)
{
- return __get_block_group_index(cache->flags);
+ return __get_raid_index(cache->flags);
}
enum btrfs_loop_type {
@@ -5399,11 +5556,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *used_block_group;
u64 search_start = 0;
int empty_cluster = 2 * 1024 * 1024;
- int allowed_chunk_alloc = 0;
- int done_chunk_alloc = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -5432,9 +5587,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (btrfs_mixed_space_info(space_info))
use_cluster = false;
- if (orig_root->ref_cows || empty_size)
- allowed_chunk_alloc = 1;
-
if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
@@ -5708,10 +5860,6 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- if (offset < search_start)
- btrfs_add_free_space(used_block_group, offset,
- search_start - offset);
- BUG_ON(offset > search_start);
if (used_block_group != block_group)
btrfs_put_block_group(used_block_group);
btrfs_put_block_group(block_group);
@@ -5744,30 +5892,17 @@ loop:
index = 0;
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
- if (allowed_chunk_alloc) {
- ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data,
- CHUNK_ALLOC_LIMITED);
- if (ret < 0) {
- btrfs_abort_transaction(trans,
- root, ret);
- goto out;
- }
- allowed_chunk_alloc = 0;
- if (ret == 1)
- done_chunk_alloc = 1;
- } else if (!done_chunk_alloc &&
- space_info->force_alloc ==
- CHUNK_ALLOC_NO_FORCE) {
- space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+ ret = do_chunk_alloc(trans, root, data,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * Do not bail out on ENOSPC since we
+ * can do more things.
+ */
+ if (ret < 0 && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto out;
}
-
- /*
- * We didn't allocate a chunk, go ahead and drop the
- * empty size and loop again.
- */
- if (!done_chunk_alloc)
- loop = LOOP_NO_EMPTY_SIZE;
}
if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5816,13 +5951,13 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
- printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
- "%llu pinned %llu reserved\n",
+ printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
(unsigned long long)cache->key.objectid,
(unsigned long long)cache->key.offset,
(unsigned long long)btrfs_block_group_used(&cache->item),
(unsigned long long)cache->pinned,
- (unsigned long long)cache->reserved);
+ (unsigned long long)cache->reserved,
+ cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
@@ -5842,20 +5977,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
data = btrfs_get_alloc_profile(root, data);
again:
- /*
- * the only place that sets empty_size is btrfs_realloc_node, which
- * is not called recursively on allocations
- */
- if (empty_size || root->ref_cows) {
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024, data,
- CHUNK_ALLOC_NO_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
- }
-
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
hint_byte, ins, data);
@@ -5865,12 +5986,6 @@ again:
num_bytes = num_bytes >> 1;
num_bytes = num_bytes & ~(root->sectorsize - 1);
num_bytes = max(num_bytes, min_alloc_size);
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- if (ret < 0 && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -6193,7 +6308,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -6212,15 +6328,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
ret = block_rsv_use_bytes(block_rsv, blocksize);
if (!ret)
return block_rsv;
- if (ret) {
+ if (ret && !block_rsv->failfast) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
/*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs)) {
- printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
- WARN_ON(1);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+ ret);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
@@ -6670,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
@@ -7177,7 +7295,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
alloc_flags = update_block_group_flags(root, cache->flags);
if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7187,7 +7305,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ ret = do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
@@ -7201,7 +7319,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 type)
{
u64 alloc_flags = get_alloc_profile(root, type);
- return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ return do_chunk_alloc(trans, root, alloc_flags,
CHUNK_ALLOC_FORCE);
}
@@ -7351,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
target = get_restripe_target(root->fs_info, block_group->flags);
if (target) {
- index = __get_block_group_index(extended_to_chunk(target));
+ index = __get_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -7385,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
- if (device->total_bytes > device->bytes_used + min_free) {
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -7610,8 +7729,21 @@ int btrfs_read_block_groups(struct btrfs_root *root)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
- if (need_clear)
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
cache->disk_cache_state = BTRFS_DC_CLEAR;
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ cache->dirty = 1;
+ }
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7695,6 +7827,34 @@ error:
return ret;
}
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+ new_bg_list) {
+ list_del_init(&block_group->new_bg_list);
+
+ if (ret)
+ continue;
+
+ spin_lock(&block_group->lock);
+ memcpy(&item, &block_group->item, sizeof(item));
+ memcpy(&key, &block_group->key, sizeof(key));
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ }
+}
+
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7728,6 +7888,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
btrfs_init_free_space_ctl(cache);
@@ -7759,12 +7920,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret); /* Logic error */
- ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
- sizeof(cache->item));
- if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
- return ret;
- }
+ list_add_tail(&cache->new_bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 01c21b6c6d4..1b319df29ee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
struct bio *bio;
struct extent_io_tree *tree;
get_extent_t *get_extent;
+ unsigned long bio_flags;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
@@ -64,13 +65,13 @@ tree_fs_info(struct extent_io_tree *tree)
int __init extent_io_init(void)
{
- extent_state_cache = kmem_cache_create("extent_state",
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
- extent_buffer_cache = kmem_cache_create("extent_buffers",
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
@@ -107,6 +108,12 @@ void extent_io_exit(void)
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
+
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
if (extent_state_cache)
kmem_cache_destroy(extent_state_cache);
if (extent_buffer_cache)
@@ -334,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
{
struct rb_node *node;
- if (end < start) {
- printk(KERN_ERR "btrfs end < start %llu %llu\n",
+ if (end < start)
+ WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
(unsigned long long)end,
(unsigned long long)start);
- WARN_ON(1);
- }
state->start = start;
state->end = end;
@@ -929,12 +934,14 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
/**
- * convert_extent - convert all bits in a given range from one bit to another
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * another
* @tree: the io tree to search
* @start: the start offset in bytes
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
* @mask: the allocation mask
*
* This will go through and set bits for the given range. If any states exist
@@ -944,7 +951,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int clear_bits, gfp_t mask)
+ int bits, int clear_bits,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -961,6 +969,15 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+
/*
* this search will find all the extents that end after
* our range starts.
@@ -991,6 +1008,7 @@ hit_next:
*/
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1031,6 +1049,7 @@ hit_next:
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
@@ -1069,6 +1088,7 @@ hit_next:
&bits);
if (err)
extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -1091,6 +1111,7 @@ hit_next:
extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
clear_state_bit(tree, prealloc, &clear_bits, 0);
prealloc = NULL;
goto out;
@@ -1143,6 +1164,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
NULL, cached_state, mask);
}
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
@@ -1287,18 +1316,42 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, int bits)
+ u64 *start_ret, u64 *end_ret, int bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
+ struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && state->tree) {
+ n = rb_next(&state->rb_node);
+ while (n) {
+ state = rb_entry(n, struct extent_state,
+ rb_node);
+ if (state->state & bits)
+ goto got_it;
+ n = rb_next(n);
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
state = find_first_extent_bit_state(tree, start, bits);
+got_it:
if (state) {
+ cache_state(state, cached_state);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
}
+out:
spin_unlock(&tree->lock);
return ret;
}
@@ -1864,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
* the standard behavior is to write all copies in a raid setup. here we only
* want to write the one bad copy. so we do the mapping for ourselves and issue
* submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
* actually prevents the read that triggered the error from finishing.
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
u64 length, u64 logical, struct page *page,
int mirror_num)
{
@@ -1891,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
bio->bi_size = 0;
map_length = length;
- ret = btrfs_map_block(map_tree, WRITE, logical,
+ ret = btrfs_map_block(fs_info, WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
bio_put(bio);
@@ -1918,7 +1971,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
return -EIO;
}
- printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+ printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
"(dev %s sector %llu)\n", page->mapping->host->i_ino,
start, rcu_str_deref(dev->name), sector);
@@ -1929,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
u64 start = eb->start;
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
int ret = 0;
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+ ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
start, p, mirror_num);
if (ret)
break;
@@ -1955,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct btrfs_mapping_tree *map_tree;
+ struct btrfs_fs_info *fs_info;
struct extent_state *state;
int num_copies;
int did_repair = 0;
@@ -1991,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
spin_unlock(&BTRFS_I(inode)->io_tree.lock);
if (state && state->start == failrec->start) {
- map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
- num_copies = btrfs_num_copies(map_tree, failrec->logical,
- failrec->len);
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ num_copies = btrfs_num_copies(fs_info, failrec->logical,
+ failrec->len);
if (num_copies > 1) {
- ret = repair_io_failure(map_tree, start, failrec->len,
+ ret = repair_io_failure(fs_info, start, failrec->len,
failrec->logical, page,
failrec->failed_mirror);
did_repair = !ret;
@@ -2061,7 +2113,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
}
read_unlock(&em_tree->lock);
- if (!em || IS_ERR(em)) {
+ if (!em) {
kfree(failrec);
return -EIO;
}
@@ -2104,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
* clean_io_failure() clean all those errors at once.
*/
}
- num_copies = btrfs_num_copies(
- &BTRFS_I(inode)->root->fs_info->mapping_tree,
- failrec->logical, failrec->len);
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
if (num_copies == 1) {
/*
* we only have a single copy of the data, so don't bother with
@@ -2297,8 +2348,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_state *cached = NULL;
struct extent_state *state;
- pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
- "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+ pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+ "mirror=%ld\n", (u64)bio->bi_sector, err,
(long int)bio->bi_bdev);
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2329,23 +2380,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
- if (ret) {
- /* no IO indicated but software detected errors
- * in the block, either checksum errors or
- * issues with the contents */
- struct btrfs_root *root =
- BTRFS_I(page->mapping->host)->root;
- struct btrfs_device *device;
-
+ if (ret)
uptodate = 0;
- device = btrfs_find_device_for_logical(
- root, start, mirror);
- if (device)
- btrfs_dev_stat_inc_and_print(device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- } else {
+ else
clean_io_failure(start, page);
- }
}
if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -2424,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
return bio;
}
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
static int __must_check submit_one_bio(int rw, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
@@ -2715,12 +2749,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag);
- BUG_ON(ret == -ENOMEM);
- nr++;
- *bio_flags = this_bio_flag;
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ }
}
- if (ret)
+ if (ret) {
SetPageError(page);
+ unlock_extent(tree, cur, cur + iosize - 1);
+ }
cur = cur + iosize;
pg_offset += iosize;
}
@@ -3077,8 +3114,15 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
}
}
+ /*
+ * We need to do this to prevent races in people who check if the eb is
+ * under IO since we can end up having no IO bits set for a short period
+ * of time.
+ */
+ spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
spin_lock(&fs_info->delalloc_lock);
if (fs_info->dirty_metadata_bytes >= eb->len)
@@ -3087,6 +3131,8 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
WARN_ON(1);
spin_unlock(&fs_info->delalloc_lock);
ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
}
btrfs_tree_unlock(eb);
@@ -3158,12 +3204,16 @@ static int write_one_eb(struct extent_buffer *eb,
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
u64 offset = eb->start;
unsigned long i, num_pages;
+ unsigned long bio_flags = 0;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
atomic_set(&eb->io_pages, num_pages);
+ if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+ bio_flags = EXTENT_BIO_TREE_LOG;
+
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
@@ -3172,7 +3222,8 @@ static int write_one_eb(struct extent_buffer *eb,
ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
- 0, 0, 0);
+ 0, epd->bio_flags, bio_flags);
+ epd->bio_flags = bio_flags;
if (ret) {
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
SetPageError(p);
@@ -3207,6 +3258,7 @@ int btree_write_cache_pages(struct address_space *mapping,
.tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
int ret = 0;
int done = 0;
@@ -3251,19 +3303,34 @@ retry:
break;
}
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON
+ * but no sense in crashing the users box for something
+ * we can survive anyway.
+ */
if (!eb) {
+ spin_unlock(&mapping->private_lock);
WARN_ON(1);
continue;
}
- if (eb == prev_eb)
+ if (eb == prev_eb) {
+ spin_unlock(&mapping->private_lock);
continue;
+ }
- if (!atomic_inc_not_zero(&eb->refs)) {
- WARN_ON(1);
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
continue;
- }
prev_eb = eb;
ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
@@ -3454,7 +3521,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
if (epd->sync_io)
rw = WRITE_SYNC;
- ret = submit_one_bio(rw, epd->bio, 0, 0);
+ ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
epd->bio = NULL;
}
@@ -3477,6 +3544,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = __extent_writepage(page, wbc, &epd);
@@ -3501,6 +3569,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.get_extent = get_extent,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
@@ -3540,6 +3609,7 @@ int extent_writepages(struct extent_io_tree *tree,
.get_extent = get_extent,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
};
ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3557,19 +3627,38 @@ int extent_readpages(struct extent_io_tree *tree,
struct bio *bio = NULL;
unsigned page_idx;
unsigned long bio_flags = 0;
+ struct page *pagepool[16];
+ struct page *page;
+ int i = 0;
+ int nr = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ page = list_entry(pages->prev, struct page, lru);
prefetchw(&page->flags);
list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping,
+ if (add_to_page_cache_lru(page, mapping,
page->index, GFP_NOFS)) {
- __extent_read_full_page(tree, page, get_extent,
- &bio, 0, &bio_flags);
+ page_cache_release(page);
+ continue;
}
- page_cache_release(page);
+
+ pagepool[nr++] = page;
+ if (nr < ARRAY_SIZE(pagepool))
+ continue;
+ for (i = 0; i < nr; i++) {
+ __extent_read_full_page(tree, pagepool[i], get_extent,
+ &bio, 0, &bio_flags);
+ page_cache_release(pagepool[i]);
+ }
+ nr = 0;
+ }
+ for (i = 0; i < nr; i++) {
+ __extent_read_full_page(tree, pagepool[i], get_extent,
+ &bio, 0, &bio_flags);
+ page_cache_release(pagepool[i]);
}
+
BUG_ON(!list_empty(pages));
if (bio)
return submit_one_bio(READ, bio, 0, bio_flags);
@@ -3898,18 +3987,6 @@ out:
return ret;
}
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
- unsigned long i)
-{
- return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
-}
-
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
@@ -4025,8 +4102,8 @@ st