From 7d0fa3ecba2f12ceef93fffe615e5dd9b50bb794 Mon Sep 17 00:00:00 2001 From: Alain Renaud Date: Fri, 8 Jun 2012 15:34:46 -0400 Subject: xfs: xfs_vm_writepage clear iomap_valid when !buffer_uptodate (REV2) On filesytems with a block size smaller than PAGE_SIZE we currently have a problem with unwritten extents. If a we have multi-block page for which an unwritten extent has been allocated, and only some of the buffers have been written to, and they are not contiguous, we can expose stale data from disk in the blocks between the writes after extent conversion. Example of a page with unwritten and real data. buffer content 0 empty b_state = 0 1 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 2 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 3 empty b_state = 0 4 empty b_state = 0 5 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 6 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 7 empty b_state = 0 Buffers 1, 2, 5, and 6 have been written to, leaving 0, 3, 4, and 7 empty. Currently buffers 1, 2, 5, and 6 are added to a single ioend, and when IO has completed, extent conversion creates a real extent from block 1 through block 6, leaving 0 and 7 unwritten. However buffers 3 and 4 were not written to disk, so stale data is exposed from those blocks on a subsequent read. Fix this by setting iomap_valid = 0 when we find a buffer that is not Uptodate. This ensures that buffers 5 and 6 are not added to the same ioend as buffers 1 and 2. Later these blocks will be converted into two separate real extents, leaving the blocks in between unwritten. Signed-off-by: Alain Renaud Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ae31c313a79..8dad722c004 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -981,10 +981,15 @@ xfs_vm_writepage( imap_valid = 0; } } else { - if (PageUptodate(page)) { + if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); - imap_valid = 0; - } + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + imap_valid = 0; continue; } -- cgit v1.2.3 From 0f2cf9d3d917b269645902506adaa4ff92b5e506 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 7 Jun 2012 15:44:32 +0800 Subject: xfs: fix debug_object WARN at xfs_alloc_vextent() Fengguang reports: [ 780.529603] XFS (vdd): Ending clean mount [ 781.454590] ODEBUG: object is on stack, but not annotated [ 781.455433] ------------[ cut here ]------------ [ 781.455433] WARNING: at /c/kernel-tests/sound/lib/debugobjects.c:301 __debug_object_init+0x173/0x1f1() [ 781.455433] Hardware name: Bochs [ 781.455433] Modules linked in: [ 781.455433] Pid: 26910, comm: kworker/0:2 Not tainted 3.4.0+ #51 [ 781.455433] Call Trace: [ 781.455433] [] warn_slowpath_common+0x83/0x9b [ 781.455433] [] warn_slowpath_null+0x1a/0x1c [ 781.455433] [] __debug_object_init+0x173/0x1f1 [ 781.455433] [] debug_object_init+0x14/0x16 [ 781.455433] [] __init_work+0x20/0x22 [ 781.455433] [] xfs_alloc_vextent+0x6c/0xd5 Use INIT_WORK_ONSTACK in xfs_alloc_vextent instead of INIT_WORK. Reported-by: Wu Fengguang Signed-off-by: Jie Liu Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 229641fb8e6..a996e398692 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2441,7 +2441,7 @@ xfs_alloc_vextent( DECLARE_COMPLETION_ONSTACK(done); args->done = &done; - INIT_WORK(&args->work, xfs_alloc_vextent_worker); + INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); return args->result; -- cgit v1.2.3 From d2c2819117176e139dc761873c664aaa770c79c9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jun 2012 15:44:53 +1000 Subject: xfs: m_maxioffset is redundant The m_maxioffset field in the struct xfs_mount contains the same value as the superblock s_maxbytes field. There is no need to carry two copies of this limit around, so use the VFS superblock version. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 12 ++++++------ fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_mount.c | 2 -- fs/xfs/xfs_mount.h | 3 +-- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8dad722c004..84e372596d5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -323,10 +323,10 @@ xfs_map_blocks( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_maxioffset); + ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, @@ -1162,9 +1162,9 @@ __xfs_get_blocks( lockmode = xfs_ilock_map_shared(ip); } - ASSERT(offset <= mp->m_maxioffset); - if (offset + size > mp->m_maxioffset) - size = mp->m_maxioffset - offset; + ASSERT(offset <= mp->m_super->s_maxbytes); + if (offset + size > mp->m_super->s_maxbytes) + size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index aadfce6681e..4590cd1da43 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -416,8 +416,8 @@ retry: * Make sure preallocation does not create extents beyond the range we * actually support in this filesystem. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); ASSERT(last_fsb > offset_fsb); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536021fb3d4..9536fd19019 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1200,8 +1200,6 @@ xfs_mountfs( xfs_set_maxicount(mp); - mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - error = xfs_uuid_mount(mp); if (error) goto out; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8b89c5ac72d..47c6b3b3eb9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -176,7 +176,6 @@ typedef struct xfs_mount { uint m_qflags; /* quota status flags */ xfs_trans_reservations_t m_reservations;/* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ @@ -297,7 +296,7 @@ xfs_preferred_iosize(xfs_mount_t *mp) PAGE_CACHE_SIZE)); } -#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) +#define XFS_MAXIOFFSET(mp) ((mp)->m_super->s_maxbytes) #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) -- cgit v1.2.3 From 32972383ca46223aa2b129826b3789721ec147aa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jun 2012 15:44:54 +1000 Subject: xfs: make largest supported offset less shouty XFS_MAXIOFFSET() is just a simple macro that resolves to mp->m_maxioffset. It doesn't need to exist, and it just makes the code unnecessarily loud and shouty. Make it quiet and easy to read. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_mount.h | 2 -- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_vnodeops.c | 10 +++++----- 7 files changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 58b815ec8c9..848ffa77707 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5517,7 +5517,7 @@ xfs_getbmap( if (xfs_get_extsz_hint(ip) || ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; - fixlen = XFS_MAXIOFFSET(mp); + fixlen = mp->m_super->s_maxbytes; } else { prealloced = 0; fixlen = XFS_ISIZE(ip); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f7ec15a652..59e22c989cd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -273,7 +273,7 @@ xfs_file_aio_read( } } - n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + n = mp->m_super->s_maxbytes - iocb->ki_pos; if (n <= 0 || size == 0) return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a59eea09930..257f3c463e0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1226,7 +1226,7 @@ xfs_itruncate_extents( * then there is nothing to do. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (first_unmap_block == last_block) return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 4590cd1da43..915edf6639f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -285,7 +285,7 @@ xfs_iomap_eof_want_preallocate( * do any speculative allocation. */ start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 47c6b3b3eb9..90a45305407 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -296,8 +296,6 @@ xfs_preferred_iosize(xfs_mount_t *mp) PAGE_CACHE_SIZE)); } -#define XFS_MAXIOFFSET(mp) ((mp)->m_super->s_maxbytes) - #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 249db198776..2e86fa0cfc0 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -940,7 +940,7 @@ xfs_qm_dqiterate( map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); lblkno = 0; - maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { nmaps = XFS_DQITER_MAP_SIZE; /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b6a82d817a8..c22f4e0ecac 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -174,7 +174,7 @@ xfs_free_eofblocks( * of the file. If not, then there is nothing to do. */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return 0; map_len = last_fsb - end_fsb; @@ -2262,10 +2262,10 @@ xfs_change_file_space( llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; - if ( (bf->l_start < 0) - || (bf->l_start > XFS_MAXIOFFSET(mp)) - || (bf->l_start + llen < 0) - || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) + if (bf->l_start < 0 || + bf->l_start > mp->m_super->s_maxbytes || + bf->l_start + llen < 0 || + bf->l_start + llen > mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; -- cgit v1.2.3 From 5276432997feb2366ac1e77949e94fe86a394813 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jun 2012 15:45:44 +1000 Subject: xfs: kill copy and paste segment checks in xfs_file_aio_read The generic segment check code now returns a count of the number of bytes in the iovec, so we don't need to roll our own anymore. Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 59e22c989cd..c4559c6e6f2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -236,7 +236,6 @@ xfs_file_aio_read( ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; - unsigned long seg; XFS_STATS_INC(xs_read_calls); @@ -247,19 +246,9 @@ xfs_file_aio_read( if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - /* START copy & waste from filemap.c */ - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ + ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); + if (ret < 0) + return ret; if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = -- cgit v1.2.3 From 51c84223af604ce2d00d0416c30a38c50aed00bd Mon Sep 17 00:00:00 2001 From: Chen Baozi Date: Sat, 26 May 2012 00:48:47 +0800 Subject: xfs: fix typo in comment of xfs_dinode_t. There should be "XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR" instead of "XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR". Signed-off-by: Chen Baozi Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_dinode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index a3721633abc..1d9643b3dce 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -33,7 +33,7 @@ typedef struct xfs_timestamp { * variable size the leftover area split into a data and an attribute fork. * The format of the data and attribute fork depends on the format of the * inode as indicated by di_format and di_aformat. To access the data and - * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros * below. * * There is a very similar struct icdinode in xfs_inode which matches the -- cgit v1.2.3 From 76e8f1386673b864cfca3c24c4d5814740e76465 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 11 Jun 2012 10:39:43 -0400 Subject: xfs: check for stale inode before acquiring iflock on push An inode in the AIL can be flush locked and marked stale if a cluster free transaction occurs at the right time. The inode item is then marked as flushing, which causes xfsaild to spin and leaves the filesystem stalled. This is reproduced by running xfstests 273 in a loop for an extended period of time. Check for stale inodes before the flush lock. This marks the inode as pinned, leads to a log flush and allows the filesystem to proceed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode_item.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6cdbf90c6f7..d041d47d9d8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -504,6 +504,14 @@ xfs_inode_item_push( goto out_unlock; } + /* + * Stale inode items should force out the iclog. + */ + if (ip->i_flags & XFS_ISTALE) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + /* * Someone else is already flushing the inode. Nothing we can do * here but wait for the flush to finish and remove the item from @@ -514,15 +522,6 @@ xfs_inode_item_push( goto out_unlock; } - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return XFS_ITEM_PINNED; - } - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); -- cgit v1.2.3 From 079da28c64ebeca357adae77aea3ae7160e45d98 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Jun 2012 14:20:26 +1000 Subject: xfs: fix allocbt cursor leak in xfs_alloc_ag_vextent_near When we fail to find an matching extent near the requested extent specification during a left-right distance search in xfs_alloc_ag_vextent_near, we fail to free the original cursor that we used to look up the XFS_BTNUM_CNT tree and hence leak it. Reported-by: Chris J Arges Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a996e398692..9d1aeb7e273 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1080,6 +1080,7 @@ restart: goto restart; } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; -- cgit v1.2.3 From bcf62ab64d1ba257dd9d4283a077a7219a05073a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Jun 2012 00:32:26 +0200 Subject: xfs: Fix overallocation in xfs_buf_allocate_memory() Commit de1cbee which removed b_file_offset in favor of b_bn introduced a bug causing xfs_buf_allocate_memory() to overestimate the number of necessary pages. The problem is that xfs_buf_alloc() sets b_bn to -1 and thus effectively every buffer is straddling a page boundary which causes xfs_buf_allocate_memory() to allocate two pages and use vmalloc() for access which is unnecessary. Dave says xfs_buf_alloc() doesn't need to set b_bn to -1 anymore since the buffer is inserted into the cache only after being fully initialized now. So just make xfs_buf_alloc() fill in proper block number from the beginning. CC: David Chinner Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 172d3cc8f8c..a4beb421018 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -201,14 +201,7 @@ xfs_buf_alloc( bp->b_length = numblks; bp->b_io_length = numblks; bp->b_flags = flags; - - /* - * We do not set the block number here in the buffer because we have not - * finished initialising the buffer. We insert the buffer into the cache - * in this state, so this ensures that we are unable to do IO on a - * buffer that hasn't been fully initialised. - */ - bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_bn = blkno; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -567,11 +560,6 @@ xfs_buf_get( if (bp != new_bp) xfs_buf_free(new_bp); - /* - * Now we have a workable buffer, fill in the block number so - * that we can do IO on it. - */ - bp->b_bn = blkno; bp->b_io_length = bp->b_length; found: @@ -772,7 +760,7 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, numblks, 0); + bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); if (unlikely(bp == NULL)) goto fail; -- cgit v1.2.3 From 11159a0500c1eb7a8a2de37b7dceb53373c75350 Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Fri, 25 May 2012 15:45:36 -0500 Subject: xfs: shutdown xfs_sync_worker before the log Revert commit 1307bbd, which uses the s_umount semaphore to provide exclusion between xfs_sync_worker and unmount, in favor of shutting down the sync worker before freeing the log in xfs_log_unmount. This is a cleaner way of resolving the race between xfs_sync_worker and unmount than using s_umount. Signed-off-by: Ben Myers Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_sync.c | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f30d9807dc4..0e1a64f0439 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -810,6 +810,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { + cancel_delayed_work_sync(&mp->m_sync_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index c9d3409c5ca..1e9ee064dbb 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -386,23 +386,23 @@ xfs_sync_worker( * We shouldn't write/force the log if we are in the mount/unmount * process or on a read only filesystem. The workqueue still needs to be * active in both cases, however, because it is used for inode reclaim - * during these times. Use the s_umount semaphore to provide exclusion - * with unmount. + * during these times. Use the MS_ACTIVE flag to avoid doing anything + * during mount. Doing work during unmount is avoided by calling + * cancel_delayed_work_sync on this work queue before tearing down + * the ail and the log in xfs_log_unmount. */ - if (down_read_trylock(&mp->m_super->s_umount)) { - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently - * dirty */ - xfs_ail_push_all(mp->m_ail); - } - up_read(&mp->m_super->s_umount); + if (!(mp->m_super->s_flags & MS_ACTIVE) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently + * dirty */ + xfs_ail_push_all(mp->m_ail); } /* queue us up again */ -- cgit v1.2.3 From ad223e6030be017470e46f153de27a43979759e0 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 14 Jun 2012 09:22:15 -0500 Subject: xfs: rename log structure to xlog Rename the XFS log structure to xlog to help crash distinquish it from the other logs in Linux. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 76 ++++++++++++++++++++++++++++-------------------- fs/xfs/xfs_log_cil.c | 22 +++++++------- fs/xfs/xfs_log_priv.h | 46 +++++++++++++++++++---------- fs/xfs/xfs_log_recover.c | 38 ++++++++++++------------ fs/xfs/xfs_mount.h | 4 +-- fs/xfs/xfs_trace.h | 18 ++++++------ 6 files changed, 116 insertions(+), 88 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0e1a64f0439..d90d4a38860 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -38,13 +38,21 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, - xlog_in_core_t **, xfs_lsn_t *); +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -64,8 +72,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log, int eventual_size); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_grant_push_ail(struct log *log, - int need_bytes); +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); STATIC void xlog_ungrant_log_space(xlog_t *log, @@ -73,7 +83,9 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, @@ -89,9 +101,9 @@ STATIC int xlog_iclogs_empty(xlog_t *log); static void xlog_grant_sub_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -115,9 +127,9 @@ xlog_grant_sub_space( static void xlog_grant_add_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -165,7 +177,7 @@ xlog_grant_head_wake_all( static inline int xlog_ticket_reservation( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic) { @@ -182,7 +194,7 @@ xlog_ticket_reservation( STATIC bool xlog_grant_head_wake( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, int *free_bytes) { @@ -204,7 +216,7 @@ xlog_grant_head_wake( STATIC int xlog_grant_head_wait( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, int need_bytes) @@ -256,7 +268,7 @@ shutdown: */ STATIC int xlog_grant_head_check( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, int *need_bytes) @@ -323,7 +335,7 @@ xfs_log_regrant( struct xfs_mount *mp, struct xlog_ticket *tic) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int need_bytes; int error = 0; @@ -389,7 +401,7 @@ xfs_log_reserve( bool permanent, uint t_type) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_ticket *tic; int need_bytes; int error = 0; @@ -465,7 +477,7 @@ xfs_log_done( struct xlog_in_core **iclog, uint flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || @@ -839,7 +851,7 @@ void xfs_log_space_wake( struct xfs_mount *mp) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) @@ -917,7 +929,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xfs_log_item *lip; xfs_lsn_t tail_lsn; @@ -966,7 +978,7 @@ xlog_assign_tail_lsn( */ STATIC int xlog_space_left( - struct log *log, + struct xlog *log, atomic64_t *head) { int free_bytes; @@ -1278,7 +1290,7 @@ out: */ STATIC int xlog_commit_record( - struct log *log, + struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp) @@ -1312,7 +1324,7 @@ xlog_commit_record( */ STATIC void xlog_grant_push_ail( - struct log *log, + struct xlog *log, int need_bytes) { xfs_lsn_t threshold_lsn = 0; @@ -1791,7 +1803,7 @@ xlog_write_start_rec( static xlog_op_header_t * xlog_write_setup_ophdr( - struct log *log, + struct xlog *log, struct xlog_op_header *ophdr, struct xlog_ticket *ticket, uint flags) @@ -1874,7 +1886,7 @@ xlog_write_setup_copy( static int xlog_write_copy_finish( - struct log *log, + struct xlog *log, struct xlog_in_core *iclog, uint flags, int *record_cnt, @@ -1959,7 +1971,7 @@ xlog_write_copy_finish( */ int xlog_write( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, @@ -2822,7 +2834,7 @@ _xfs_log_force( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; xfs_lsn_t lsn; @@ -2970,7 +2982,7 @@ _xfs_log_force_lsn( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; int already_slept = 0; @@ -3148,7 +3160,7 @@ xfs_log_ticket_get( */ xlog_ticket_t * xlog_ticket_alloc( - struct log *log, + struct xlog *log, int unit_bytes, int cnt, char client, @@ -3279,7 +3291,7 @@ xlog_ticket_alloc( */ void xlog_verify_dest_ptr( - struct log *log, + struct xlog *log, char *ptr) { int i; @@ -3308,7 +3320,7 @@ xlog_verify_dest_ptr( */ STATIC void xlog_verify_grant_tail( - struct log *log) + struct xlog *log) { int tail_cycle, tail_blocks; int cycle, space; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7d6197c5849..ddc4529d07d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -44,7 +44,7 @@ */ static struct xlog_ticket * xlog_cil_ticket_alloc( - struct log *log) + struct xlog *log) { struct xlog_ticket *tic; @@ -72,7 +72,7 @@ xlog_cil_ticket_alloc( */ void xlog_cil_init_post_recovery( - struct log *log) + struct xlog *log) { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; @@ -182,7 +182,7 @@ xlog_cil_prepare_log_vecs( */ STATIC void xfs_cil_prepare_item( - struct log *log, + struct xlog *log, struct xfs_log_vec *lv, int *len, int *diff_iovecs) @@ -231,7 +231,7 @@ xfs_cil_prepare_item( */ static void xlog_cil_insert_items( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket) { @@ -373,7 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -601,7 +601,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct log *log) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; @@ -629,7 +629,7 @@ xlog_cil_push_background( static void xlog_cil_push_foreground( - struct log *log, + struct xlog *log, xfs_lsn_t push_seq) { struct xfs_cil *cil = log->l_cilp; @@ -683,7 +683,7 @@ xfs_log_commit_cil( xfs_lsn_t *commit_lsn, int flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int log_flags = 0; struct xfs_log_vec *log_vector; @@ -754,7 +754,7 @@ xfs_log_commit_cil( */ xfs_lsn_t xlog_cil_force_lsn( - struct log *log, + struct xlog *log, xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; @@ -833,7 +833,7 @@ xfs_log_item_in_current_chkpt( */ int xlog_cil_init( - struct log *log) + struct xlog *log) { struct xfs_cil *cil; struct xfs_cil_ctx *ctx; @@ -869,7 +869,7 @@ xlog_cil_init( void xlog_cil_destroy( - struct log *log) + struct xlog *log) { if (log->l_cilp->xc_ctx) { if (log->l_cilp->xc_ctx->ticket) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 5bc33261f5b..72eba2201b1 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -19,7 +19,7 @@ #define __XFS_LOG_PRIV_H__ struct xfs_buf; -struct log; +struct xlog; struct xlog_ticket; struct xfs_mount; @@ -352,7 +352,7 @@ typedef struct xlog_in_core { struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; - struct log *ic_log; + struct xlog *ic_log; int ic_size; int ic_offset; int ic_bwritecnt; @@ -409,7 +409,7 @@ struct xfs_cil_ctx { * operations almost as efficient as the old logging methods. */ struct xfs_cil { - struct log *xc_log; + struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; struct xfs_cil_ctx *xc_ctx; @@ -487,7 +487,7 @@ struct xlog_grant_head { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct log { +typedef struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -553,9 +553,14 @@ extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; -struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, - int count, char client, bool permanent, - xfs_km_flags_t alloc_flags); +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); static inline void @@ -567,9 +572,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); -int xlog_write(struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, uint flags); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -629,17 +639,23 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) /* * Committed Item List interfaces */ -int xlog_cil_init(struct log *log); -void xlog_cil_init_post_recovery(struct log *log); -void xlog_cil_destroy(struct log *log); +int +xlog_cil_init(struct xlog *log); +void +xlog_cil_init_post_recovery(struct xlog *log); +void +xlog_cil_destroy(struct xlog *log); /* * CIL force routines */ -xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); static inline void -xlog_cil_force(struct log *log) +xlog_cil_force(struct xlog *log) { xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ca386909131..a7be98abd6a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1471,8 +1471,8 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1517,8 +1517,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1588,8 +1588,8 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; @@ -1642,8 +1642,8 @@ xlog_recover_reorder_trans( */ STATIC int xlog_recover_buffer_pass1( - struct log *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; struct list_head *bucket; @@ -1696,7 +1696,7 @@ xlog_recover_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - struct log *log, + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) @@ -2689,9 +2689,9 @@ xlog_recover_free_trans( STATIC int xlog_recover_commit_pass1( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); @@ -2716,10 +2716,10 @@ xlog_recover_commit_pass1( STATIC int xlog_recover_commit_pass2( - struct log *log, - struct xlog_recover *trans, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); @@ -2753,7 +2753,7 @@ xlog_recover_commit_pass2( */ STATIC int xlog_recover_commit_trans( - struct log *log, + struct xlog *log, struct xlog_recover *trans, int pass) { @@ -2793,8 +2793,8 @@ out: STATIC int xlog_recover_unmount_trans( - struct log *log, - xlog_recover_t *trans) + struct xlog *log, + struct xlog_recover *trans) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 90a45305407..8724336a9a0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -53,7 +53,7 @@ typedef struct xfs_trans_reservations { #include "xfs_sync.h" -struct log; +struct xlog; struct xfs_mount_args; struct xfs_inode; struct xfs_bmbt_irec; @@ -133,7 +133,7 @@ typedef struct xfs_mount { uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ - struct log *m_log; /* log specific stuff */ + struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7cf9d3529e5..caf5dabfd55 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -32,7 +32,7 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xfs_log_item; struct xlog_ticket; -struct log; +struct xlog; struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; @@ -762,7 +762,7 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); DECLARE_EVENT_CLASS(xfs_loggrant_class, - TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) @@ -830,7 +830,7 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, #define DEFINE_LOGGRANT_EVENT(name) \ DEFINE_EVENT(xfs_loggrant_class, name, \ - TP_PROTO(struct log *log, struct xlog_ticket *tic), \ + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); @@ -1664,7 +1664,7 @@ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); DECLARE_EVENT_CLASS(xfs_log_recover_item_class, - TP_PROTO(struct log *log, struct xlog_recover *trans, + TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), TP_ARGS(log, trans, item, pass), TP_STRUCT__entry( @@ -1698,7 +1698,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, #define DEFINE_LOG_RECOVER_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_item_class, name, \ - TP_PROTO(struct log *log, struct xlog_recover *trans, \ + TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ struct xlog_recover_item *item, int pass), \ TP_ARGS(log, trans, item, pass)) @@ -1709,7 +1709,7 @@ DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, - TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), TP_ARGS(log, buf_f), TP_STRUCT__entry( __field(dev_t, dev) @@ -1739,7 +1739,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, #define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ - TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ TP_ARGS(log, buf_f)) DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); @@ -1752,7 +1752,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, - TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), TP_ARGS(log, in_f), TP_STRUCT__entry( __field(dev_t, dev) @@ -1790,7 +1790,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, ) #define DEFINE_LOG_RECOVER_INO_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ - TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ TP_ARGS(log, in_f)) DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); -- cgit v1.2.3 From 9a8d2fdbb47aaa1eaa136b89da5e5e6b60015c78 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 14 Jun 2012 09:22:16 -0500 Subject: xfs: remove xlog_t typedef Remove the xlog_t type definitions. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 223 +++++++++++++++++++++++++++++------------------ fs/xfs/xfs_log_priv.h | 18 ++-- fs/xfs/xfs_log_recover.c | 140 +++++++++++++++-------------- 3 files changed, 224 insertions(+), 157 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d90d4a38860..7f4f9370d0e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -45,51 +45,85 @@ xlog_commit_record( struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp); -STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks); +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_dealloc_log(xlog_t *log); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); -STATIC int xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclog, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp); -STATIC int xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog); -STATIC void xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size); -STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); STATIC void xlog_grant_push_ail( - struct xlog *log, - int need_bytes); -STATIC void xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC void xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket); + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); STATIC void xlog_verify_grant_tail( - struct xlog *log); -STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, - int count, boolean_t syncing); -STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn); + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) @@ -97,7 +131,9 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, #define xlog_verify_tail_lsn(a,b,c) #endif -STATIC int xlog_iclogs_empty(xlog_t *log); +STATIC int +xlog_iclogs_empty( + struct xlog *log); static void xlog_grant_sub_space( @@ -684,7 +720,7 @@ xfs_log_mount_finish(xfs_mount_t *mp) int xfs_log_unmount_write(xfs_mount_t *mp) { - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; xlog_in_core_t *iclog; #ifdef DEBUG xlog_in_core_t *first_iclog; @@ -893,7 +929,7 @@ int xfs_log_need_covered(xfs_mount_t *mp) { int needed = 0; - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; if (!xfs_fs_writable(mp)) return 0; @@ -1024,9 +1060,9 @@ xlog_space_left( void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog = bp->b_fspriv; - xlog_t *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. @@ -1067,8 +1103,9 @@ xlog_iodone(xfs_buf_t *bp) */ STATIC void -xlog_get_iclog_buffer_size(xfs_mount_t *mp, - xlog_t *log) +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) { int size; int xhdrs; @@ -1129,13 +1166,14 @@ done: * Its primary purpose is to fill in enough, so recovery can occur. However, * some other stuff may be filled in too. */ -STATIC xlog_t * -xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - xlog_t *log; + struct xlog *log; xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; @@ -1144,7 +1182,7 @@ xlog_alloc_log(xfs_mount_t *mp, int error = ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1434,8 +1472,9 @@ xlog_bdstrat( */ STATIC int -xlog_sync(xlog_t *log, - xlog_in_core_t *iclog) +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) { xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; @@ -1584,7 +1623,8 @@ xlog_sync(xlog_t *log, * Deallocate a log structure */ STATIC void -xlog_dealloc_log(xlog_t *log) +xlog_dealloc_log( + struct xlog *log) { xlog_in_core_t *iclog, *next_iclog; int i; @@ -1616,10 +1656,11 @@ xlog_dealloc_log(xlog_t *log) */ /* ARGSUSED */ static inline void -xlog_state_finish_copy(xlog_t *log, - xlog_in_core_t *iclog, - int record_cnt, - int copy_bytes) +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) { spin_lock(&log->l_icloglock); @@ -2142,7 +2183,8 @@ xlog_write( * State Change: DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log(xlog_t *log) +xlog_state_clean_log( + struct xlog *log) { xlog_in_core_t *iclog; int changed = 0; @@ -2222,7 +2264,7 @@ xlog_state_clean_log(xlog_t *log) STATIC xfs_lsn_t xlog_get_lowest_lsn( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; @@ -2245,9 +2287,9 @@ xlog_get_lowest_lsn( STATIC void xlog_state_do_callback( - xlog_t *log, - int aborted, - xlog_in_core_t *ciclog) + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've @@ -2467,7 +2509,7 @@ xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { - xlog_t *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); @@ -2521,12 +2563,13 @@ xlog_state_done_syncing( * is copied. */ STATIC int -xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclogp, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp) +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) { int log_offset; xlog_rec_header_t *head; @@ -2631,8 +2674,9 @@ restart: * move grant reservation head forward. */ STATIC void -xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { trace_xfs_log_regrant_reserve_enter(log, ticket); @@ -2677,8 +2721,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, * in the current reservation field. */ STATIC void -xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { int bytes; @@ -2717,8 +2762,8 @@ xlog_ungrant_log_space(xlog_t *log, */ STATIC int xlog_state_release_iclog( - xlog_t *log, - xlog_in_core_t *iclog) + struct xlog *log, + struct xlog_in_core *iclog) { int sync = 0; /* do we sync? */ @@ -2768,9 +2813,10 @@ xlog_state_release_iclog( * that every data block. We have run out of space in this log record. */ STATIC void -xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size) +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); if (!eventual_size) @@ -3114,7 +3160,9 @@ xfs_log_force_lsn( * disk. */ STATIC void -xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) { assert_spin_locked(&log->l_icloglock); @@ -3158,7 +3206,7 @@ xfs_log_ticket_get( /* * Allocate and initialise a new log ticket. */ -xlog_ticket_t * +struct xlog_ticket * xlog_ticket_alloc( struct xlog *log, int unit_bytes, @@ -3346,9 +3394,10 @@ xlog_verify_grant_tail( /* check if it will fit */ STATIC void -xlog_verify_tail_lsn(xlog_t *log, - xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn) +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) { int blocks; @@ -3385,10 +3434,11 @@ xlog_verify_tail_lsn(xlog_t *log, * the cycle numbers agree with the current cycle number. */ STATIC void -xlog_verify_iclog(xlog_t *log, - xlog_in_core_t *iclog, - int count, - boolean_t syncing) +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + boolean_t syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3482,7 +3532,7 @@ xlog_verify_iclog(xlog_t *log, */ STATIC int xlog_state_ioerror( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *iclog, *ic; @@ -3527,7 +3577,7 @@ xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_t *log; + struct xlog *log; int retval; log = mp->m_log; @@ -3634,7 +3684,8 @@ xfs_log_force_umount( } STATIC int -xlog_iclogs_empty(xlog_t *log) +xlog_iclogs_empty( + struct xlog *log) { xlog_in_core_t *iclog; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 72eba2201b1..18a801d76a4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -487,7 +487,7 @@ struct xlog_grant_head { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct xlog { +struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -540,7 +540,7 @@ typedef struct xlog { char *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif -} xlog_t; +}; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) @@ -548,9 +548,17 @@ typedef struct xlog { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ -extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log); -extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a7be98abd6a..a76ba886e73 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -43,10 +43,18 @@ #include "xfs_utils.h" #include "xfs_trace.h" -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif @@ -74,7 +82,7 @@ struct xfs_buf_cancel { static inline int xlog_buf_bbcount_valid( - xlog_t *log, + struct xlog *log, int bbcount) { return bbcount > 0 && bbcount <= log->l_logBBsize; @@ -87,7 +95,7 @@ xlog_buf_bbcount_valid( */ STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { struct xfs_buf *bp; @@ -138,10 +146,10 @@ xlog_put_bp( */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); @@ -155,10 +163,10 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -189,10 +197,10 @@ xlog_bread_noalign( STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -211,10 +219,10 @@ xlog_bread( */ STATIC int xlog_bread_offset( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; @@ -241,10 +249,10 @@ xlog_bread_offset( */ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -378,8 +386,8 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -421,7 +429,7 @@ xlog_find_cycle_start( */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -490,7 +498,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -600,7 +608,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -871,7 +879,7 @@ validate_head: */ STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -1080,7 +1088,7 @@ done: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1183,7 +1191,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1205,7 +1213,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1305,7 +1313,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -2050,11 +2058,11 @@ xfs_qm_dqcheck( */ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; @@ -2108,9 +2116,9 @@ xlog_recover_do_dquot_buffer( */ STATIC int xlog_recover_buffer_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; @@ -2189,9 +2197,9 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2452,14 +2460,14 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int xlog_recover_quotaoff_pass1( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); @@ -2483,9 +2491,9 @@ xlog_recover_quotaoff_pass1( */ STATIC int xlog_recover_dquot_pass2( - xlog_t *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2578,9 +2586,9 @@ xlog_recover_dquot_pass2( */ STATIC int xlog_recover_efi_pass2( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn) + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; xfs_mount_t *mp = log->l_mp; @@ -2616,8 +2624,8 @@ xlog_recover_efi_pass2( */ STATIC int xlog_recover_efd_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2812,9 +2820,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, + struct xlog *log, struct hlist_head rhash[], - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2986,7 +2994,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3147,7 +3155,7 @@ xlog_recover_process_one_iunlink( */ STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; @@ -3209,9 +3217,9 @@ xlog_recover_process_iunlinks( #ifdef DEBUG STATIC void xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) + struct xlog *log, + struct xlog_in_core *iclog, + int size) { int i; __be32 *up; @@ -3234,8 +3242,8 @@ xlog_pack_data_checksum( */ void xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, + struct xlog *log, + struct xlog_in_core *iclog, int roundoff) { int i, j, k; @@ -3274,9 +3282,9 @@ xlog_pack_data( STATIC void xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; @@ -3299,8 +3307,8 @@ xlog_unpack_data( STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; @@ -3343,7 +3351,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3595,7 +3603,7 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3646,7 +3654,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3721,7 +3729,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3767,7 +3775,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3814,7 +3822,7 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; -- cgit v1.2.3 From 77c1a08fc9ece4cb130b9fd279738e799f0c2864 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:07 +1000 Subject: xfs: struct xfs_buf_log_format isn't variable sized. The struct xfs_buf_log_format wants to think the dirty bitmap is variable sized. In fact, it is variable size on disk simply due to the way we map it from the in-memory structure, but we still just use a fixed size memory allocation for the in-memory structure. Hence it makes no sense to set the function up as a variable sized structure when we already know it's maximum size, and we always allocate it as such. Simplify the structure by making the dirty bitmap a fixed sized array and just using the size of the structure for the allocation size. This will make it much simpler to allocate and manipulate an array of format structures for discontiguous buffer support. The previous struct xfs_buf_log_item size according to /proc/slabinfo was 224 bytes. pahole doesn't give the same size because of the variable size definition. With this modification, pahole reports the same as /proc/slabinfo: /* size: 224, cachelines: 4, members: 6 */ Because the xfs_buf_log_item size is now determined by the maximum supported block size we introduce a dependency on xfs_alloc_btree.h. Avoid this dependency by moving the idefines for the maximum block sizes supported to xfs_types.h with all the other max/min type defines to avoid any new dependencies. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.h | 14 -------------- fs/xfs/xfs_buf_item.c | 14 ++++++-------- fs/xfs/xfs_buf_item.h | 36 ++++++++++++++++++------------------ fs/xfs/xfs_super.c | 5 ++--- fs/xfs/xfs_types.h | 14 ++++++++++++++ 5 files changed, 40 insertions(+), 43 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index a6caa0022c9..359fb86ed87 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -50,20 +50,6 @@ typedef struct xfs_alloc_rec_incore { /* btree pointer type */ typedef __be32 xfs_alloc_ptr_t; -/* - * Minimum and maximum blocksize and sectorsize. - * The blocksize upper limit is pretty much arbitrary. - * The sectorsize upper limit is due to sizeof(sb_sectsize). - */ -#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ -#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) -#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) -#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ -#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) -#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) - /* * Block numbers in the AG: * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 45df2b857d4..52cd8f89ee7 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -240,15 +240,13 @@ xfs_buf_item_format( (bip->bli_flags & XFS_BLI_STALE)); /* - * The size of the base structure is the size of the - * declared structure plus the space for the extra words - * of the bitmap. We subtract one from the map size, because - * the first element of the bitmap is accounted for in the - * size of the base structure. + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. */ - base_size = - (uint)(sizeof(xfs_buf_log_format_t) + - ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); + base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + + (bip->bli_format.blf_map_size * + sizeof(bip->bli_format.blf_data_map[0])); vecp->i_addr = &bip->bli_format; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index b6ecd2061e7..ff268678023 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -20,23 +20,6 @@ extern kmem_zone_t *xfs_buf_item_zone; -/* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. - */ -typedef struct xfs_buf_log_format { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_t; - /* * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. @@ -60,6 +43,23 @@ typedef struct xfs_buf_log_format { #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + /* * buf log item flags */ @@ -102,7 +102,7 @@ typedef struct xfs_buf_log_item { char *bli_orig; /* original buffer copy */ char *bli_logged; /* bytes logged (bitmap) */ #endif - xfs_buf_log_format_t bli_format; /* in-log header */ + struct xfs_buf_log_format bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0d9de41a715..425f6e9d4c0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1514,9 +1514,8 @@ xfs_init_zones(void) * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / - NBWORD) * sizeof(int))), "xfs_buf_item"); + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_log_item_desc_zone; diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 398cf681d02..7a41874f4c2 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -132,6 +132,20 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ #define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + /* * Min numbers of data/attr fork btree root pointers. */ -- cgit v1.2.3 From cbb7baab285a540f173ef1ec3d5bcf9d0ad29d16 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:08 +1000 Subject: xfs: separate buffer indexing from block map To support discontiguous buffers in the buffer cache, we need to separate the cache index variables from the I/O map. While this is currently a 1:1 mapping, discontiguous buffer support will break this relationship. However, for caching purposes, we can still treat them the same as a contiguous buffer - the block number of the first block and the length of the buffer - as that is still a unique representation. Also, the only way we will ever access the discontiguous regions of buffers is via bulding the complete buffer in the first place, so using the initial block number and entire buffer length is a sane way to index the buffers. Add a block mapping vector construct to the xfs_buf and use it in the places where we are doing IO instead of the current b_bn/b_length variables. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 21 ++++++++++++--------- fs/xfs/xfs_buf.h | 27 +++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a4beb421018..a843873b095 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -202,6 +202,8 @@ xfs_buf_alloc( bp->b_io_length = numblks; bp->b_flags = flags; bp->b_bn = blkno; + bp->b_map.bm_bn = blkno; + bp->b_map.bm_len = numblks; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -327,8 +329,9 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) @@ -560,8 +563,6 @@ xfs_buf_get( if (bp != new_bp) xfs_buf_free(new_bp); - bp->b_io_length = bp->b_length; - found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); @@ -584,7 +585,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -665,8 +666,8 @@ xfs_buf_read_uncached( return NULL; /* set up the buffer for a read IO */ - XFS_BUF_SET_ADDR(bp, daddr); - XFS_BUF_READ(bp); + bp->b_map.bm_bn = daddr; + bp->b_flags |= XBF_READ; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); @@ -695,6 +696,8 @@ xfs_buf_set_empty( bp->b_length = numblks; bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_map.bm_bn = XFS_BUF_DADDR_NULL; + bp->b_map.bm_len = bp->b_length; } static inline struct page * @@ -1159,7 +1162,7 @@ _xfs_buf_ioapply( struct bio *bio; int offset = bp->b_offset; int size = BBTOB(bp->b_io_length); - sector_t sector = bp->b_bn; + sector_t sector = bp->b_map.bm_bn; total_nr_pages = bp->b_page_count; map_i = 0; @@ -1564,7 +1567,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_bn - bp->b_bn; + diff = ap->b_map.bm_bn - bp->b_map.bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7f1d1392ce3..c9c2ba90c53 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -58,6 +58,7 @@ typedef enum { #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ typedef unsigned int xfs_buf_flags_t; @@ -75,7 +76,8 @@ typedef unsigned int xfs_buf_flags_t; { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" } + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -98,6 +100,11 @@ typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); #define XB_PAGES 2 +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -107,7 +114,7 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_daddr_t b_bn; /* block number for I/O */ + xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ @@ -127,12 +134,14 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map b_map; /* compound buffer map */ int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif @@ -233,8 +242,18 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ADDR(bp) ((bp)->b_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. + */ +#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { -- cgit v1.2.3 From 3e85c868a697805a3d4c7800a6bacdfc81d15cdf Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:09 +1000 Subject: xfs: convert internal buffer functions to pass maps While the external interface currently uses separate blockno/length variables, we need to move internal interfaces to passing and parsing vector maps. This will then allow us to add external interfaces to support discontiguous buffer maps as the internal code will already support them. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 202 ++++++++++++++++++++++++++++++++++++++++++------------- fs/xfs/xfs_buf.h | 43 +++++++++--- 2 files changed, 191 insertions(+), 54 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a843873b095..82bb8123ab2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -164,14 +164,49 @@ xfs_buf_stale( ASSERT(atomic_read(&bp->b_hold) >= 1); } +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + struct xfs_buf * -xfs_buf_alloc( +_xfs_buf_alloc( struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; + int error; + int i; bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) @@ -192,18 +227,28 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; + bp->b_flags = flags; /* * Set length and io_length to the same value initially. * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_length = numblks; - bp->b_io_length = numblks; - bp->b_flags = flags; - bp->b_bn = blkno; - bp->b_map.bm_bn = blkno; - bp->b_map.bm_len = numblks; + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -282,6 +327,7 @@ xfs_buf_free( } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); } @@ -428,8 +474,8 @@ _xfs_buf_map_pages( xfs_buf_t * _xfs_buf_find( struct xfs_buftarg *btp, - xfs_daddr_t blkno, - size_t numblks, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { @@ -438,7 +484,12 @@ _xfs_buf_find( struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; + xfs_daddr_t blkno = map[0].bm_bn; + int numblks = 0; + int i; + for (i = 0; i < nmaps; i++) + numblks += map[i].bm_len; numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ @@ -539,22 +590,23 @@ xfs_buf_get( struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); + bp = _xfs_buf_find(target, &map, 1, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, blkno, numblks, flags); + new_bp = _xfs_buf_alloc(target, &map, 1, flags); if (unlikely(!new_bp)) return NULL; error = xfs_buf_allocate_memory(new_bp, flags); if (error) { - kmem_zone_free(xfs_buf_zone, new_bp); + xfs_buf_free(new_bp); return NULL; } - bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + bp = _xfs_buf_find(target, &map, 1, flags, new_bp); if (!bp) { xfs_buf_free(new_bp); return NULL; @@ -666,7 +718,9 @@ xfs_buf_read_uncached( return NULL; /* set up the buffer for a read IO */ - bp->b_map.bm_bn = daddr; + ASSERT(bp->b_map_count == 1); + bp->b_bn = daddr; + bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; xfsbdstrat(target->bt_mount, bp); @@ -695,9 +749,11 @@ xfs_buf_set_empty( bp->b_addr = NULL; bp->b_length = numblks; bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_map.bm_bn = XFS_BUF_DADDR_NULL; - bp->b_map.bm_len = bp->b_length; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; } static inline struct page * @@ -761,9 +817,10 @@ xfs_buf_get_uncached( { unsigned long page_count; int error, i; - xfs_buf_t *bp; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); + bp = _xfs_buf_alloc(target, &map, 1, 0); if (unlikely(bp == NULL)) goto fail; @@ -794,6 +851,7 @@ xfs_buf_get_uncached( __free_page(bp->b_pages[i]); _xfs_buf_free_pages(bp); fail_free_buf: + xfs_buf_free_maps(bp); kmem_zone_free(xfs_buf_zone, bp); fail: return NULL; @@ -1154,36 +1212,39 @@ xfs_buf_bio_end_io( bio_put(bio); } -STATIC void -_xfs_buf_ioapply( - xfs_buf_t *bp) +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) { - int rw, map_i, total_nr_pages, nr_pages; - struct bio *bio; - int offset = bp->b_offset; - int size = BBTOB(bp->b_io_length); - sector_t sector = bp->b_map.bm_bn; + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; total_nr_pages = bp->b_page_count; - map_i = 0; - if (bp->b_flags & XBF_WRITE) { - if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; - if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; - if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; - } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; - } else { - rw = READ; + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; } - /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; next_chunk: atomic_inc(&bp->b_io_remaining); @@ -1198,13 +1259,14 @@ next_chunk: bio->bi_private = bp; - for (; size && nr_pages; nr_pages--, map_i++) { + for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; if (nbytes > size) nbytes = size; - rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); if (rbytes < nbytes) break; @@ -1226,6 +1288,54 @@ next_chunk: xfs_buf_ioerror(bp, EIO); bio_put(bio); } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); } void diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c9c2ba90c53..67d134994ae 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,6 +105,9 @@ struct xfs_buf_map { int bm_len; /* size of I/O */ }; +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -134,7 +137,9 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ - struct xfs_buf_map b_map; /* compound buffer map */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map b_map; /* inline compound buffer map */ + int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ @@ -149,11 +154,35 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags, - struct xfs_buf *new_bp); -#define xfs_incore(buftarg,blkno,len,lockit) \ - _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags); @@ -163,8 +192,6 @@ void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks); struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); -- cgit v1.2.3 From 6dde27077eaf590eac279627f74b7e4e40b864b2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:10 +1000 Subject: xfs: add discontiguous buffer map interface With the internal interfaces supporting discontiguous buffer maps, add external lookup, read and get interfaces so they can start to be used. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 37 ++++++++++++++++++------------------- fs/xfs/xfs_buf.h | 46 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 58 insertions(+), 25 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82bb8123ab2..39c5d7622de 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -581,22 +581,21 @@ found: * more hits than misses. */ struct xfs_buf * -xfs_buf_get( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - bp = _xfs_buf_find(target, &map, 1, flags, NULL); + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); if (likely(bp)) goto found; - new_bp = _xfs_buf_alloc(target, &map, 1, flags); + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) return NULL; @@ -606,7 +605,7 @@ xfs_buf_get( return NULL; } - bp = _xfs_buf_find(target, &map, 1, flags, new_bp); + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); if (!bp) { xfs_buf_free(new_bp); return NULL; @@ -649,17 +648,17 @@ _xfs_buf_read( } xfs_buf_t * -xfs_buf_read( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks, +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, xfs_buf_flags_t flags) { - xfs_buf_t *bp; + struct xfs_buf *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, blkno, numblks, flags); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -687,15 +686,15 @@ xfs_buf_read( * safe manner. */ void -xfs_buf_readahead( - xfs_buftarg_t *target, - xfs_daddr_t blkno, - size_t numblks) +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, blkno, numblks, + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 67d134994ae..aa96bd410ae 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -184,12 +184,46 @@ xfs_buf_alloc( return _xfs_buf_alloc(target, &map, 1, flags); } -struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, xfs_buf_flags_t flags); -void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks); +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1); +} struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); -- cgit v1.2.3 From de2a4f59190303ff5b82ead2969968a325e61230 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:11 +1000 Subject: xfs: add discontiguous buffer support to transactions Now that the buffer cache supports discontiguous buffers, add support to the transaction buffer interface for getting and reading buffers. Note that this patch does not convert the buffer item logging to support discontiguous buffers. That will be done as a separate commit. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans.h | 50 +++++++++++++++++++++++++++++++++---- fs/xfs/xfs_trans_buf.c | 68 +++++++++++++++++++++++++------------------------- 2 files changed, 79 insertions(+), 39 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7c37b533aa8..bc2afd52a0b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -448,11 +448,51 @@ xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); -struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, - int, uint); -int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, - struct xfs_buftarg *, xfs_daddr_t, int, uint, - struct xfs_buf **); + +struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + uint flags); + +static inline struct xfs_buf * +xfs_trans_get_buf( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + uint flags) +{ + struct xfs_buf_map map = { + .bm_bn = blkno, + .bm_len = numblks, + }; + return xfs_trans_get_buf_map(tp, target, &map, 1, flags); +} + +int xfs_trans_read_buf_map(struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp); + +static inline int +xfs_trans_read_buf( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + struct xfs_buf_map map = { + .bm_bn = blkno, + .bm_len = numblks, + }; + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); +} + struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 21c5a5e3700..6311b99c267 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -41,20 +41,26 @@ STATIC struct xfs_buf * xfs_trans_buf_item_match( struct xfs_trans *tp, struct xfs_buftarg *target, - xfs_daddr_t blkno, - int len) + struct xfs_buf_map *map, + int nmaps) { struct xfs_log_item_desc *lidp; struct xfs_buf_log_item *blip; + int len = 0; + int i; + + for (i = 0; i < nmaps; i++) + len += map[i].bm_len; - len = BBTOB(len); list_for_each_entry(lidp, &tp->t_items, lid_trans) { blip = (struct xfs_buf_log_item *)lidp->lid_item; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && - XFS_BUF_ADDR(blip->bli_buf) == blkno && - BBTOB(blip->bli_buf->b_length) == len) + XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + blip->bli_buf->b_length == len) { + ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; + } } return NULL; @@ -128,21 +134,19 @@ xfs_trans_bjoin( * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ -xfs_buf_t * -xfs_trans_get_buf(xfs_trans_t *tp, - xfs_buftarg_t *target_dev, - xfs_daddr_t blkno, - int len, - uint flags) +struct xfs_buf * +xfs_trans_get_buf_map( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, flags); + if (!tp) + return xfs_buf_get_map(target, map, nmaps, flags); /* * If we find the buffer in the cache with this transaction @@ -150,7 +154,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { @@ -167,7 +171,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return (bp); } - bp = xfs_buf_get(target_dev, blkno, len, flags); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp == NULL) { return NULL; } @@ -246,26 +250,22 @@ int xfs_error_mod = 33; * read_buf() call. */ int -xfs_trans_read_buf( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) +xfs_trans_read_buf_map( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; int error; *bpp = NULL; - - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags); + if (!tp) { + bp = xfs_buf_read_map(target, map, nmaps, flags); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -303,7 +303,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - bp = xfs_trans_buf_item_match(tp, target, blkno, len); + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); @@ -349,7 +349,7 @@ xfs_trans_read_buf( return 0; } - bp = xfs_buf_read(target, blkno, len, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? -- cgit v1.2.3 From 372cc85ec6820c91b4eeff303880f25cb5a00ab5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:12 +1000 Subject: xfs: support discontiguous buffers in the xfs_buf_log_item discontigous buffer in separate buffer format structures. This means log recovery will recover all the changes on a per segment basis without requiring any knowledge of the fact that it was logged from a compound buffer. To do this, we need to be able to determine what buffer segment any given offset into the compound buffer sits over. This enables us to translate the dirty bitmap in the number of separate buffer format structures required. We also need to be able to determine the number of bitmap elements that a given buffer segment has, as this determines the size of the buffer format structure. Hence we need to be able to determine the both the start offset into the buffer and the length of a given segment to be able to calculate this. With this information, we can preallocate, build and format the correct log vector array for each segment in a compound buffer to appear exactly the same as individually logged buffers in the log. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 335 ++++++++++++++++++++++++++++++++++++-------------- fs/xfs/xfs_buf_item.h | 2 + 2 files changed, 244 insertions(+), 93 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 52cd8f89ee7..e4a6e4b6fa0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -153,33 +153,25 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * If the XFS_BLI_STALE flag has been set, then log nothing. */ STATIC uint -xfs_buf_item_size( - struct xfs_log_item *lip) +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp) { - struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; uint nvecs; int next_bit; int last_bit; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - if (bip->bli_flags & XFS_BLI_STALE) { - /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. - */ - trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - return 1; - } + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return 0; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + nvecs = 2; - ASSERT(bip->bli_flags & XFS_BLI_LOGGED); - nvecs = 1; - last_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(last_bit != -1); - nvecs++; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -187,16 +179,15 @@ xfs_buf_item_size( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { - last_bit = -1; + break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; @@ -210,22 +201,73 @@ xfs_buf_item_size( } } - trace_xfs_buf_item_size(bip); return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given log buf item. It fills the first entry with a buf log - * format structure, and the rest point to contiguous chunks - * within the buffer. + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. */ -STATIC void -xfs_buf_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +STATIC uint +xfs_buf_item_size( + struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); + uint nvecs; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + return bip->bli_format_count; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + nvecs = 0; + for (i = 0; i < bip->bli_format_count; i++) { + nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + } + + trace_xfs_buf_item_size(bip); + return nvecs; +} + +static struct xfs_log_iovec * +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_iovec *vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ struct xfs_buf *bp = bip->bli_buf; uint base_size; uint nvecs; @@ -235,9 +277,8 @@ xfs_buf_item_format( uint nbits; uint buffer_offset; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || - (bip->bli_flags & XFS_BLI_STALE)); + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->bli_format.blf_flags; /* * Base size is the actual size of the ondisk structure - it reflects @@ -245,28 +286,13 @@ xfs_buf_item_format( * memory structure. */ base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + - (bip->bli_format.blf_map_size * - sizeof(bip->bli_format.blf_data_map[0])); - vecp->i_addr = &bip->bli_format; + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; nvecs = 1; - /* - * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer - * this state if the inode buffer allocation has not yet been committed - * to the log as setting the XFS_BLI_INODE_BUF flag will prevent - * correct replay of the inode allocation. - */ - if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; - bip->bli_flags &= ~XFS_BLI_INODE_BUF; - } - if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -274,16 +300,15 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - bip->bli_format.blf_size = nvecs; - return; + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); + blfp->blf_size = nvecs; + return vecp; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); ASSERT(first_bit != -1); last_bit = first_bit; nbits = 1; @@ -294,9 +319,8 @@ xfs_buf_item_format( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - (uint)last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); /* * If we run out of bits fill in the last iovec and get * out of the loop. @@ -307,14 +331,14 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; @@ -323,14 +347,17 @@ xfs_buf_item_format( first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + } else if (xfs_buf_offset(bp, offset + + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + + (last_bit << XFS_BLF_SHIFT)) + XFS_BLF_CHUNK)) { - buffer_offset = first_bit * XFS_BLF_CHUNK; + buffer_offset = offset + first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* You would think we need to bump the nvecs here too, but we do not +/* + * You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here * nvecs++; @@ -345,6 +372,48 @@ xfs_buf_item_format( } } bip->bli_format.blf_size = nvecs; + return vecp; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + for (i = 0; i < bip->bli_format_count; i++) { + vecp = xfs_buf_item_format_segment(bip, vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } /* * Check to make sure everything is consistent. @@ -620,6 +689,35 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_committing = xfs_buf_item_committing }; +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} /* * Allocate a new buf log item to go with the given buffer. @@ -637,6 +735,8 @@ xfs_buf_item_init( xfs_buf_log_item_t *bip; int chunks; int map_size; + int error; + int i; /* * Check to see if there is already a buf log item for @@ -648,25 +748,33 @@ xfs_buf_item_init( if (lip != NULL && lip->li_type == XFS_LI_BUF) return; - /* - * chunks is the number of XFS_BLF_CHUNK size pieces - * the buffer can be divided into. Make sure not to - * truncate any pieces. map_size is the size of the - * bitmap needed to describe the chunks of the buffer. - */ - chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> - XFS_BLF_SHIFT); - map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); - - bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, - KM_SLEEP); + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); - bip->bli_format.blf_type = XFS_LI_BUF; - bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)bp->b_length; - bip->bli_format.blf_map_size = map_size; + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. + */ + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } #ifdef XFS_TRANS_DEBUG /* @@ -697,10 +805,11 @@ xfs_buf_item_init( * item's bitmap. */ void -xfs_buf_item_log( - xfs_buf_log_item_t *bip, +xfs_buf_item_log_segment( + struct xfs_buf_log_item *bip, uint first, - uint last) + uint last, + uint *map) { uint first_bit; uint last_bit; @@ -712,12 +821,6 @@ xfs_buf_item_log( uint end_bit; uint mask; - /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - /* * Convert byte offsets to bit numbers. */ @@ -734,7 +837,7 @@ xfs_buf_item_log( * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &map[word_num]; /* * Calculate the starting bit in the first word. @@ -781,6 +884,51 @@ xfs_buf_item_log( xfs_buf_item_log_debug(bip, first, last); } +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(bip, first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } +} + /* * Return 1 if the buffer has some data that has been logged (at any @@ -802,6 +950,7 @@ xfs_buf_item_free( kmem_free(bip->bli_logged); #endif /* XFS_TRANS_DEBUG */ + xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index ff268678023..6850f49f4af 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -102,6 +102,8 @@ typedef struct xfs_buf_log_item { char *bli_orig; /* original buffer copy */ char *bli_logged; /* bytes logged (bitmap) */ #endif + int bli_format_count; /* count of headers */ + struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; -- cgit v1.2.3 From 3605431fb9739a30ccd0c6380ae8e3c6f8e670a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:13 +1000 Subject: xfs: use discontiguous xfs_buf support in dabuf wrappers First step in converting the directory code to use native discontiguous buffers and replacing the dabuf construct. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_da_btree.c | 538 ++++++++++++++++++++++---------------------------- fs/xfs/xfs_da_btree.h | 6 +- 2 files changed, 239 insertions(+), 305 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 015b946c580..76e5dbaa95e 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -85,7 +85,7 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, */ STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps); +STATIC xfs_dabuf_t *xfs_da_buf_make(xfs_buf_t *bp); STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); @@ -1967,35 +1967,75 @@ xfs_da_map_covers_blocks( } /* - * Make a dabuf. - * Used for get_buf, read_buf, read_bufr, and reada_buf. + * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. + * + * For the single map case, it is assumed that the caller has provided a pointer + * to a valid xfs_buf_map. For the multiple map case, this function will + * allocate the xfs_buf_map to hold all the maps and replace the caller's single + * map pointer with the allocated map. */ -STATIC int -xfs_da_do_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t *mappedbnop, - xfs_dabuf_t **bpp, - int whichfork, - int caller) +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + unsigned int *nmaps, + struct xfs_bmbt_irec *irecs, + unsigned int nirecs) { - xfs_buf_t *bp = NULL; - xfs_buf_t **bplist; - int error=0; - int i; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_daddr_t mappedbno; - xfs_mount_t *mp; - int nbplist=0; - int nfsb; - int nmap; - xfs_dabuf_t *rbp; + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + if (!map) + return ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); - mp = dp->i_mount; nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; - mappedbno = *mappedbnop; + /* * Caller doesn't have a mapping. -2 means don't complain * if we land in a hole. @@ -2004,112 +2044,152 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) - mapp = ↦ - else - mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); - nmap = nfsb; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, - &nmap, xfs_bmapi_aflag(whichfork)); + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); if (error) - goto exit0; + goto out; } else { - map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = nfsb; - mapp = ↦ - nmap = 1; + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; } - if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { - error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); if (unlikely(error == EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; xfs_alert(mp, "%s: bno %lld dir: inode %lld", __func__, (long long)bno, (long long)dp->i_ino); - for (i = 0; i < nmap; i++) { + for (i = 0; i < *nmaps; i++) { xfs_alert(mp, "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", i, - (long long)mapp[i].br_startoff, - (long long)mapp[i].br_startblock, - (long long)mapp[i].br_blockcount, - mapp[i].br_state); + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); } } XFS_ERROR_REPORT("xfs_da_do_buf(1)", XFS_ERRLEVEL_LOW, mp); } - goto exit0; + goto out; } - if (caller != 3 && nmap > 1) { - bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); - nbplist = 0; - } else - bplist = NULL; - /* - * Turn the mapping(s) into buffer(s). - */ - for (i = 0; i < nmap; i++) { - int nmapped; - - mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); - if (i == 0) - *mappedbnop = mappedbno; - nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); - switch (caller) { - case 0: - bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, - mappedbno, nmapped, 0); - error = bp ? bp->b_error : XFS_ERROR(EIO); - break; - case 1: - case 2: - bp = NULL; - error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, - mappedbno, nmapped, 0, &bp); - break; - case 3: - xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) error = 0; - bp = NULL; - break; - } - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); - goto exit1; - } - if (!bp) - continue; - if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) - xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); - else - xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - } - if (bplist) { - bplist[nbplist++] = bp; - } + goto out_free; } - /* - * Build a dabuf structure. - */ - if (bplist) { - rbp = xfs_da_buf_make(nbplist, bplist); - } else if (bp) - rbp = xfs_da_buf_make(1, &bp); + + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? bp->b_error : XFS_ERROR(EIO); + if (error) { + xfs_trans_brelse(trans, bp); + goto out_free; + } + + *bpp = xfs_da_buf_make(bp); + +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp); + if (error) + goto out_free; + + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else - rbp = NULL; + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + + *bpp = xfs_da_buf_make(bp); + /* - * For read_buf, check the magic number. + * This verification code will be moved to a CRC verification callback + * function so just leave it here unchanged until then. */ - if (caller == 1) { - xfs_dir2_data_hdr_t *hdr = rbp->data; - xfs_dir2_free_t *free = rbp->data; - xfs_da_blkinfo_t *info = rbp->data; + { + xfs_dir2_data_hdr_t *hdr = (*bpp)->data; + xfs_dir2_free_t *free = (*bpp)->data; + xfs_da_blkinfo_t *info = (*bpp)->data; uint magic, magic1; + struct xfs_mount *mp = dp->i_mount; magic = be16_to_cpu(info->magic); magic1 = be32_to_cpu(hdr->magic); @@ -2123,66 +2203,20 @@ xfs_da_do_buf( (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); + trace_xfs_da_btree_corrupt(bp, _RET_IP_); XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", XFS_ERRLEVEL_LOW, mp, info); error = XFS_ERROR(EFSCORRUPTED); - xfs_da_brelse(trans, rbp); - nbplist = 0; - goto exit1; + xfs_da_brelse(trans, *bpp); + goto out_free; } } - if (bplist) { - kmem_free(bplist); - } - if (mapp != &map) { - kmem_free(mapp); - } - if (bpp) - *bpp = rbp; - return 0; -exit1: - if (bplist) { - for (i = 0; i < nbplist; i++) - xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist); - } -exit0: + +out_free: if (mapp != &map) kmem_free(mapp); - if (bpp) - *bpp = NULL; - return error; -} - -/* - * Get a buffer for the dir/attr block. - */ -int -xfs_da_get_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0); -} -/* - * Get a buffer for the dir/attr block, fill in the contents. - */ -int -xfs_da_read_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1); + return error; } /* @@ -2190,18 +2224,38 @@ xfs_da_read_buf( */ xfs_daddr_t xfs_da_reada_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - int whichfork) + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + int whichfork) { - xfs_daddr_t rval; + xfs_daddr_t mappedbno = -1; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } - rval = -1; - if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3)) + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + +out_free: + if (mapp != &map) + kmem_free(mapp); + + if (error) return -1; - else - return rval; + return mappedbno; } kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ @@ -2261,78 +2315,25 @@ xfs_da_state_free(xfs_da_state_t *state) */ /* ARGSUSED */ STATIC xfs_dabuf_t * -xfs_da_buf_make(int nbuf, xfs_buf_t **bps) +xfs_da_buf_make(xfs_buf_t *bp) { - xfs_buf_t *bp; xfs_dabuf_t *dabuf; - int i; - int off; - if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); - else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); - dabuf->dirty = 0; - if (nbuf == 1) { - dabuf->nbuf = 1; - bp = bps[0]; - dabuf->bbcount = bp->b_length; - dabuf->data = bp->b_addr; - dabuf->bps[0] = bp; - } else { - dabuf->nbuf = nbuf; - for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { - dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += bp->b_length; - } - dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { - bp = bps[i]; - memcpy((char *)dabuf->data + off, bp->b_addr, - BBTOB(bp->b_length)); - } - } + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); + dabuf->bbcount = bp->b_length; + dabuf->data = bp->b_addr; + dabuf->bp = bp; return dabuf; } -/* - * Un-dirty a dabuf. - */ -STATIC void -xfs_da_buf_clean(xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - int i; - int off; - - if (dabuf->dirty) { - ASSERT(dabuf->nbuf > 1); - dabuf->dirty = 0; - for (i = off = 0; i < dabuf->nbuf; - i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - memcpy(bp->b_addr, dabuf->data + off, - BBTOB(bp->b_length)); - } - } -} - /* * Release a dabuf. */ void xfs_da_buf_done(xfs_dabuf_t *dabuf) { - ASSERT(dabuf); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->dirty) - xfs_da_buf_clean(dabuf); - if (dabuf->nbuf > 1) { - kmem_free(dabuf->data); - kmem_free(dabuf); - } else { - kmem_zone_free(xfs_dabuf_zone, dabuf); - } + ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); + kmem_zone_free(xfs_dabuf_zone, dabuf); } /* @@ -2341,41 +2342,9 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) void xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) { - xfs_buf_t *bp; - uint f; - int i; - uint l; - int off; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == dabuf->bps[0]->b_addr); - xfs_trans_log_buf(tp, dabuf->bps[0], first, last); - return; - } - dabuf->dirty = 1; - ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { - bp = dabuf->bps[i]; - f = off; - l = f + BBTOB(bp->b_length) - 1; - if (f < first) - f = first; - if (l > last) - l = last; - if (f <= l) - xfs_trans_log_buf(tp, bp, f - off, l - off); - /* - * B_DONE is set by xfs_trans_log buf. - * If we don't set it on a new buffer (get not read) - * then if we don't put anything in the buffer it won't - * be set, and at commit it it released into the cache, - * and then a read will fail. - */ - else if (!(XFS_BUF_ISDONE(bp))) - XFS_BUF_DONE(bp); - } - ASSERT(last < off); + ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); + ASSERT(dabuf->data == dabuf->bp->b_addr); + xfs_trans_log_buf(tp, dabuf->bp, first, last); } /* @@ -2386,24 +2355,9 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) void xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) { - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } + ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); + xfs_trans_brelse(tp, dabuf->bp); xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_brelse(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); } /* @@ -2412,24 +2366,9 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) void xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) { - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; - - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } + ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_binval(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist); + xfs_trans_binval(tp, dabuf->bp); } /* @@ -2438,7 +2377,6 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf) { - ASSERT(dabuf->nbuf); ASSERT(dabuf->data); - return XFS_BUF_ADDR(dabuf->bps[0]); + return XFS_BUF_ADDR(dabuf->bp); } diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index dbf7c074ae7..0b64c4a37af 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -141,14 +141,10 @@ typedef struct xfs_da_args { * same place as the b_addr field for the buffer, else to kmem_alloced memory. */ typedef struct xfs_dabuf { - int nbuf; /* number of buffer pointers present */ - short dirty; /* data needs to be copied back */ short bbcount; /* how large is data in bbs */ void *data; /* pointer for buffers' data */ - struct xfs_buf *bps[1]; /* actually nbuf of these */ + struct xfs_buf *bp; /* actually nbuf of these */ } xfs_dabuf_t; -#define XFS_DA_BUF_SIZE(n) \ - (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) /* * Storage for holding state during Btree searches and split/join ops. -- cgit v1.2.3 From 1d9025e56143c0c4aebebdb62e46618d3d284218 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:14 +1000 Subject: xfs: remove struct xfs_dabuf and infrastructure The struct xfs_dabuf now only tracks a single xfs_buf and all the information it holds can be gained directly from the xfs_buf. Hence we can remove the struct dabuf and pass the xfs_buf around everywhere. Kill the struct dabuf and the associated infrastructure. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 78 ++++++----- fs/xfs/xfs_attr_leaf.c | 255 +++++++++++++++++++----------------- fs/xfs/xfs_attr_leaf.h | 21 ++- fs/xfs/xfs_da_btree.c | 337 +++++++++++++++++------------------------------- fs/xfs/xfs_da_btree.h | 32 +---- fs/xfs/xfs_dir2.c | 4 +- fs/xfs/xfs_dir2_block.c | 118 ++++++++--------- fs/xfs/xfs_dir2_data.c | 50 +++---- fs/xfs/xfs_dir2_leaf.c | 191 +++++++++++++-------------- fs/xfs/xfs_dir2_node.c | 236 ++++++++++++++------------------- fs/xfs/xfs_dir2_priv.h | 46 +++---- fs/xfs/xfs_dir2_sf.c | 4 +- fs/xfs/xfs_super.c | 9 +- 13 files changed, 602 insertions(+), 779 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index a17ff01b5ad..0ca1f0be62d 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -893,7 +893,7 @@ STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_leaf_addname(args); @@ -915,11 +915,11 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ retval = xfs_attr_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(retval); } @@ -937,7 +937,6 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * if required. */ retval = xfs_attr_leaf_add(bp, args); - xfs_da_buf_done(bp); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -1065,8 +1064,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } /* * Commit the remove and start the next trans in series. @@ -1092,7 +1090,7 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, committed, forkoff; trace_xfs_attr_leaf_removename(args); @@ -1111,7 +1109,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } @@ -1141,8 +1139,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ if (committed) xfs_trans_ijoin(args->trans, dp, 0); - } else - xfs_da_buf_done(bp); + } return(0); } @@ -1155,7 +1152,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) { - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; args->blkno = 0; @@ -1167,11 +1164,11 @@ xfs_attr_leaf_get(xfs_da_args_t *args) error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return(error); } error = xfs_attr_leaf_getvalue(bp, args); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } @@ -1186,23 +1183,23 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) { xfs_attr_leafblock_t *leaf; int error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return XFS_ERROR(error); ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return XFS_ERROR(EFSCORRUPTED); } error = xfs_attr_leaf_list_int(bp, context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1489,7 +1486,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_t *state; xfs_da_state_blk_t *blk; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; trace_xfs_attr_node_removename(args); @@ -1601,14 +1598,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ ASSERT(state->path.active == 1); ASSERT(state->path.blk[0].bp); - xfs_da_buf_done(state->path.blk[0].bp); state->path.blk[0].bp = NULL; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->data)->hdr.info.magic) == + ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { @@ -1635,7 +1631,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } else - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); } error = 0; @@ -1665,8 +1661,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1681,8 +1676,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1792,7 +1786,7 @@ xfs_attr_node_get(xfs_da_args_t *args) * If not in a transaction, we have to release all the buffers. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } @@ -1808,7 +1802,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_da_intnode_t *node; xfs_da_node_entry_t *btree; int error, i; - xfs_dabuf_t *bp; + struct xfs_buf *bp; cursor = context->cursor; cursor->initted = 1; @@ -1825,30 +1819,30 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { - node = bp->data; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: - leaf = bp->data; + leaf = bp->b_addr; if (cursor->hashval > be32_to_cpu(leaf->entries[ be16_to_cpu(leaf->hdr.count)-1].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } else if (cursor->hashval <= be32_to_cpu(leaf->entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } break; default: trace_xfs_attr_list_wrong_blk(context); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; } } @@ -1873,7 +1867,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) context->dp->i_mount); return(XFS_ERROR(EFSCORRUPTED)); } - node = bp->data; + node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) break; @@ -1883,7 +1877,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) XFS_ERRLEVEL_LOW, context->dp->i_mount, node); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(XFS_ERROR(EFSCORRUPTED)); } btree = node->btree; @@ -1898,10 +1892,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) } } if (i == be16_to_cpu(node->hdr.count)) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); } } ASSERT(bp != NULL); @@ -1912,24 +1906,24 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * adding the information. */ for (;;) { - leaf = bp->data; + leaf = bp->b_addr; if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(XFS_ERROR(EFSCORRUPTED)); } error = xfs_attr_leaf_list_int(bp, context); if (error) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } if (context->seen_enough || leaf->hdr.info.forw == 0) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) @@ -1941,7 +1935,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return(XFS_ERROR(EFSCORRUPTED)); } } - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return(0); } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 7d89d800f51..d330111ca73 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -54,10 +54,10 @@ * Routines used for growing the Btree. */ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - xfs_dabuf_t **bpp); -STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, - int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); + struct xfs_buf **bpp); +STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, + xfs_da_args_t *args, int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -71,9 +71,9 @@ STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * Routines used for shrinking the Btree. */ STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp, int level); + struct xfs_buf *bp, int level); STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp); + struct xfs_buf *bp); STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); @@ -480,7 +480,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) char *tmpbuffer; int error, i, size; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; xfs_ifork_t *ifp; trace_xfs_attr_sf_to_leaf(args); @@ -550,8 +550,6 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) error = 0; out: - if(bp) - xfs_da_buf_done(bp); kmem_free(tmpbuffer); return(error); } @@ -737,14 +735,16 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * a shortform attribute list. */ int -xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +xfs_attr_shortform_allfit( + struct xfs_buf *bp, + struct xfs_inode *dp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_local_t *name_loc; int bytes, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); entry = &leaf->entries[0]; @@ -774,7 +774,10 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +xfs_attr_leaf_to_shortform( + struct xfs_buf *bp, + xfs_da_args_t *args, + int forkoff) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -791,10 +794,10 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) ASSERT(tmpbuffer != NULL); ASSERT(bp != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* * Clean out the prior contents of the attribute list. @@ -855,7 +858,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; xfs_inode_t *dp; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; xfs_dablk_t blkno; int error; @@ -877,10 +880,9 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) if (error) goto out; ASSERT(bp2 != NULL); - memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); - xfs_da_buf_done(bp1); + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; - xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); /* * Set up the new root node. @@ -888,21 +890,17 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; - node = bp1->data; - leaf = bp2->data; + node = bp1->b_addr; + leaf = bp2->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* both on-disk, don't endian-flip twice */ node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; node->btree[0].before = cpu_to_be32(blkno); node->hdr.count = cpu_to_be16(1); - xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); error = 0; out: - if (bp1) - xfs_da_buf_done(bp1); - if (bp2) - xfs_da_buf_done(bp2); return(error); } @@ -916,12 +914,15 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +xfs_attr_leaf_create( + xfs_da_args_t *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_create(args); @@ -933,7 +934,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) if (error) return(error); ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); @@ -947,7 +948,7 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - sizeof(xfs_attr_leaf_hdr_t)); - xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); *bpp = bp; return(0); @@ -1014,7 +1015,9 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1023,7 +1026,7 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_add(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(leaf->hdr.count))); @@ -1085,7 +1088,10 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +xfs_attr_leaf_add_work( + struct xfs_buf *bp, + xfs_da_args_t *args, + int mapindex) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1096,7 +1102,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) xfs_mount_t *mp; int tmp, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); @@ -1110,7 +1116,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) tmp = be16_to_cpu(hdr->count) - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); memmove((char *)(entry+1), (char *)entry, tmp); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } be16_add_cpu(&hdr->count, 1); @@ -1142,7 +1148,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->index2++; } } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); @@ -1174,7 +1180,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) args->rmtblkno = 1; args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); @@ -1198,7 +1204,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) } } be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); return(0); } @@ -1207,7 +1213,9 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +xfs_attr_leaf_compact( + struct xfs_trans *trans, + struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf_s, *leaf_d; xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; @@ -1217,14 +1225,14 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); - memset(bp->data, 0, XFS_LBSIZE(mp)); + memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); + memset(bp->b_addr, 0, XFS_LBSIZE(mp)); /* * Copy basic information */ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->data; + leaf_d = bp->b_addr; hdr_s = &leaf_s->hdr; hdr_d = &leaf_d->hdr; hdr_d->info = hdr_s->info; /* struct copy */ @@ -1247,7 +1255,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) */ xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, be16_to_cpu(hdr_s->count), mp); - xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } @@ -1279,8 +1287,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); args = state->args; @@ -1298,8 +1306,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; swap = 1; } hdr1 = &leaf1->hdr; @@ -1346,8 +1354,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, leaf2, 0, count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } else if (count > be16_to_cpu(hdr1->count)) { /* * I assert that since all callers pass in an empty @@ -1378,8 +1386,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_attr_leaf_moveents(leaf2, 0, leaf1, be16_to_cpu(hdr1->count), count, state->mp); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); } /* @@ -1448,8 +1456,8 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Set up environment. */ - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; hdr1 = &leaf1->hdr; hdr2 = &leaf2->hdr; foundit = 0; @@ -1551,7 +1559,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, bytes, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Check for the degenerate case of the block being over 50% full. @@ -1559,7 +1567,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); @@ -1622,13 +1630,13 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); bytes -= sizeof(xfs_attr_leaf_hdr_t); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ } @@ -1666,7 +1674,9 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_remove( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_hdr_t *hdr; @@ -1676,7 +1686,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) int tablesize, tmp, i; xfs_mount_t *mp; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; mp = args->trans->t_mountp; @@ -1769,7 +1779,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) */ memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); be16_add_cpu(&hdr->usedbytes, -entsize); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), entsize)); @@ -1777,7 +1787,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * sizeof(xfs_attr_leaf_entry_t); memmove((char *)entry, (char *)(entry+1), tmp); be16_add_cpu(&hdr->count, -1); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); entry = &leaf->entries[be16_to_cpu(hdr->count)]; memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); @@ -1807,7 +1817,7 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) } else { hdr->holes = 1; /* mark as needing compaction */ } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); /* @@ -1840,8 +1850,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, mp = state->mp; ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); drop_hdr = &drop_leaf->hdr; @@ -1906,7 +1916,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, kmem_free(tmpbuffer); } - xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* @@ -1934,7 +1944,9 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_lookup_int( + struct xfs_buf *bp, + xfs_da_args_t *args) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -1945,7 +1957,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) trace_xfs_attr_leaf_lookup(args); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2041,7 +2053,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) * list structure. */ int -xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr_leaf_getvalue( + struct xfs_buf *bp, + xfs_da_args_t *args) { int valuelen; xfs_attr_leafblock_t *leaf; @@ -2049,7 +2063,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8)); @@ -2247,12 +2261,14 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * Return 0 unless leaf2 should go before leaf1. */ int -xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) { xfs_attr_leafblock_t *leaf1, *leaf2; - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); if ((be16_to_cpu(leaf1->hdr.count) > 0) && @@ -2272,11 +2288,13 @@ xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) * Pick up the last hashvalue from a leaf block. */ xfs_dahash_t -xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +xfs_attr_leaf_lasthash( + struct xfs_buf *bp, + int *count) { xfs_attr_leafblock_t *leaf; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -2337,7 +2355,9 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +xfs_attr_leaf_list_int( + struct xfs_buf *bp, + xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; @@ -2345,7 +2365,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) int retval, i; ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; cursor = context->cursor; cursor->initted = 1; @@ -2463,7 +2483,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2482,7 +2502,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); @@ -2505,7 +2525,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) #endif /* DEBUG */ entry->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if (args->rmtblkno) { @@ -2513,10 +2533,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2533,7 +2552,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_attr_leaf_setflag(args); @@ -2548,7 +2567,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); @@ -2556,16 +2575,15 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. @@ -2586,7 +2604,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_attr_leafblock_t *leaf1, *leaf2; xfs_attr_leaf_entry_t *entry1, *entry2; xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp1, *bp2; + struct xfs_buf *bp1, *bp2; int error; #ifdef DEBUG xfs_attr_leaf_name_local_t *name_loc; @@ -2620,13 +2638,13 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) bp2 = bp1; } - leaf1 = bp1->data; + leaf1 = bp1->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; - leaf2 = bp2->data; + leaf2 = bp2->b_addr; ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); @@ -2660,30 +2678,27 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); entry1->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } entry2->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp1); - if (bp1 != bp2) - xfs_da_buf_done(bp2); /* * Commit the flag value change and start the next trans in series. @@ -2706,7 +2721,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) { xfs_da_blkinfo_t *info; xfs_daddr_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -2718,20 +2733,20 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); - blkno = xfs_da_blkno(bp); + blkno = XFS_BUF_ADDR(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ - info = bp->data; + info = bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, bp, 1); } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { error = xfs_attr_leaf_inactive(trans, dp, bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } if (error) return(error); @@ -2742,7 +2757,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, bp); /* remove from cache */ + xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. */ @@ -2756,34 +2771,37 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, - int level) +xfs_attr_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) { xfs_da_blkinfo_t *info; xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; int error, count, i; - xfs_dabuf_t *child_bp; + struct xfs_buf *child_bp; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ return(XFS_ERROR(EIO)); } - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ count = be16_to_cpu(node->hdr.count); if (!count) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } child_fsb = be32_to_cpu(node->btree[0].before); - xfs_da_brelse(*trans, bp); /* no locks for later trans */ + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* * If this is the node level just above the leaves, simply loop @@ -2803,12 +2821,12 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, return(error); if (child_bp) { /* save for re-read later */ - child_blkno = xfs_da_blkno(child_bp); + child_blkno = XFS_BUF_ADDR(child_bp); /* * Invalidate the subtree, however we have to. */ - info = child_bp->data; + info = child_bp->b_addr; if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { error = xfs_attr_node_inactive(trans, dp, child_bp, level+1); @@ -2817,7 +2835,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, child_bp); } else { error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, child_bp); + xfs_trans_brelse(*trans, child_bp); } if (error) return(error); @@ -2830,7 +2848,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, &child_bp, XFS_ATTR_FORK); if (error) return(error); - xfs_da_binval(*trans, child_bp); + xfs_trans_binval(*trans, child_bp); } /* @@ -2843,7 +2861,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); } /* * Atomically commit the whole invalidate stuff. @@ -2863,7 +2881,10 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +xfs_attr_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; @@ -2871,7 +2892,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) xfs_attr_inactive_list_t *list, *lp; int error, count, size, tmp, i; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); /* @@ -2892,7 +2913,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) * If there are no "remote" values, we're done. */ if (count == 0) { - xfs_da_brelse(*trans, bp); + xfs_trans_brelse(*trans, bp); return(0); } @@ -2919,7 +2940,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) } } } - xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ /* * Invalidate each of the "remote" value extents. diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 9c7d22fdcf4..dea17722945 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -31,7 +31,6 @@ struct attrlist; struct attrlist_cursor_kern; struct xfs_attr_list_context; -struct xfs_dabuf; struct xfs_da_args; struct xfs_da_state; struct xfs_da_state_blk; @@ -215,7 +214,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); -int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); @@ -223,7 +222,7 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, +int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); @@ -235,14 +234,14 @@ int xfs_attr_leaf_flipflags(xfs_da_args_t *args); int xfs_attr_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, +int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, +int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, +int xfs_attr_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* @@ -257,9 +256,9 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. */ -xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); -int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 76e5dbaa95e..7bfb7dd334f 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -83,9 +83,9 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); -STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(xfs_buf_t *bp); +STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); +STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp); STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); @@ -100,10 +100,10 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork) + struct xfs_buf **bpp, int whichfork) { xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; xfs_trans_t *tp; @@ -114,7 +114,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = 0; node->hdr.info.back = 0; node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); @@ -122,7 +122,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, node->hdr.count = 0; node->hdr.level = cpu_to_be16(level); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); *bpp = bp; @@ -138,7 +138,7 @@ xfs_da_split(xfs_da_state_t *state) { xfs_da_state_blk_t *oldblk, *newblk, *addblk; xfs_da_intnode_t *node; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int max, action, error, i; trace_xfs_da_split(state->args); @@ -203,7 +203,6 @@ xfs_da_split(xfs_da_state_t *state) case XFS_DA_NODE_MAGIC: error = xfs_da_node_split(state, oldblk, newblk, addblk, max - i, &action); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; if (error) return(error); /* GROT: dir is inconsistent */ @@ -221,13 +220,6 @@ xfs_da_split(xfs_da_state_t *state) * Update the btree to show the new hashval for this child. */ xfs_da_fixhashpath(state, &state->path); - /* - * If we won't need this block again, it's getting dropped - * from the active path by the loop control, so we need - * to mark it done now. - */ - if (i > 0 || !addblk) - xfs_da_buf_done(oldblk->bp); } if (!addblk) return(0); @@ -239,8 +231,6 @@ xfs_da_split(xfs_da_state_t *state) oldblk = &state->path.blk[0]; error = xfs_da_root_split(state, oldblk, addblk); if (error) { - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ } @@ -252,7 +242,7 @@ xfs_da_split(xfs_da_state_t *state) * and the original block 0 could be at any position in the list. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { bp = addblk->bp; @@ -260,13 +250,13 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { bp = addblk->bp; @@ -274,14 +264,12 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(0); } @@ -298,7 +286,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_intnode_t *node, *oldroot; xfs_da_args_t *args; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, size; xfs_inode_t *dp; xfs_trans_t *tp; @@ -323,8 +311,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, if (error) return(error); ASSERT(bp != NULL); - node = bp->data; - oldroot = blk1->bp->data; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - (char *)oldroot); @@ -335,8 +323,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, (char *)leaf); } memcpy(node, oldroot, size); - xfs_da_log_buf(tp, bp, 0, size - 1); - xfs_da_buf_done(blk1->bp); + xfs_trans_log_buf(tp, bp, 0, size - 1); blk1->bp = bp; blk1->blkno = blkno; @@ -348,7 +335,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); if (error) return(error); - node = bp->data; + node = bp->b_addr; node->btree[0].hashval = cpu_to_be32(blk1->hashval); node->btree[0].before = cpu_to_be32(blk1->blkno); node->btree[1].hashval = cpu_to_be32(blk2->hashval); @@ -365,10 +352,9 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, #endif /* Header is already logged by xfs_da_node_create */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, node->btree, sizeof(xfs_da_node_entry_t) * 2)); - xfs_da_buf_done(bp); return(0); } @@ -389,7 +375,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_split(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); /* @@ -436,7 +422,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * If we had double-split op below us, then add the extra block too. */ - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (oldblk->index <= be16_to_cpu(node->hdr.count)) { oldblk->index++; xfs_da_node_add(state, oldblk, addblk); @@ -477,8 +463,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, trace_xfs_da_node_rebalance(state->args); - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. @@ -532,7 +518,7 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; memcpy(btree_d, btree_s, tmp); be16_add_cpu(&node1->hdr.count, count); - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* @@ -549,9 +535,9 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Log header of node 1 and all current bits of node 2. */ - xfs_da_log_buf(tp, blk1->bp, + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); - xfs_da_log_buf(tp, blk2->bp, + xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, sizeof(node2->hdr) + sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); @@ -560,8 +546,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->data; - node2 = blk2->bp->data; + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); @@ -587,7 +573,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, trace_xfs_da_node_add(state->args); - node = oldblk->bp->data; + node = oldblk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); @@ -606,10 +592,10 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, } btree->hashval = cpu_to_be32(newblk->hashval); btree->before = cpu_to_be32(newblk->blkno); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); be16_add_cpu(&node->hdr.count, 1); - xfs_da_log_buf(state->args->trans, oldblk->bp, + xfs_trans_log_buf(state->args->trans, oldblk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -735,7 +721,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_intnode_t *oldroot; xfs_da_args_t *args; xfs_dablk_t child; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; trace_xfs_da_root_join(state->args); @@ -743,7 +729,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) args = state->args; ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); - oldroot = root_blk->bp->data; + oldroot = root_blk->bp->b_addr; ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(!oldroot->hdr.info.forw); ASSERT(!oldroot->hdr.info.back); @@ -765,11 +751,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) if (error) return(error); ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->data, + xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); - memcpy(root_blk->bp->data, bp->data, state->blocksize); - xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -791,7 +777,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_da_blkinfo_t *info; int count, forward, error, retval, i; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Check for the degenerate case of the block being over 50% full. @@ -799,7 +785,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; count = be16_to_cpu(node->hdr.count); @@ -859,10 +845,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) count = state->node_ents; count -= state->node_ents >> 2; count -= be16_to_cpu(node->hdr.count); - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); count -= be16_to_cpu(node->hdr.count); - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); if (count >= 0) break; /* fits with at least 25% to spare */ } @@ -934,14 +920,14 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) break; } for (blk--, level--; level >= 0; blk--, level--) { - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); btree = &node->btree[ blk->index ]; if (be32_to_cpu(btree->hashval) == lasthash) break; blk->hashval = lasthash; btree->hashval = cpu_to_be32(lasthash); - xfs_da_log_buf(state->args->trans, blk->bp, + xfs_trans_log_buf(state->args->trans, blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); @@ -960,7 +946,7 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) trace_xfs_da_node_remove(state->args); - node = drop_blk->bp->data; + node = drop_blk->bp->b_addr; ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); ASSERT(drop_blk->index >= 0); @@ -972,15 +958,15 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); memmove(btree, btree + 1, tmp); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, tmp)); btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; } memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); be16_add_cpu(&node->hdr.count, -1); - xfs_da_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(state->args->trans, drop_blk->bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); /* @@ -1005,8 +991,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_node_unbalance(state->args); - drop_node = drop_blk->bp->data; - save_node = save_blk->bp->data; + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); tp = state->args->trans; @@ -1023,13 +1009,13 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); memmove(btree, &save_node->btree[0], tmp); btree = &save_node->btree[0]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * sizeof(xfs_da_node_entry_t))); } else { btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, btree, be16_to_cpu(drop_node->hdr.count) * sizeof(xfs_da_node_entry_t))); @@ -1042,7 +1028,7 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, memcpy(btree, &drop_node->btree[0], tmp); be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); - xfs_da_log_buf(tp, save_blk->bp, + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, sizeof(save_node->hdr))); @@ -1100,7 +1086,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) state->path.active--; return(error); } - curr = blk->bp->data; + curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); ASSERT(blk->magic == XFS_DA_NODE_MAGIC || blk->magic == XFS_DIR2_LEAFN_MAGIC || @@ -1110,7 +1096,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Search an intermediate node for a match. */ if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->data; + node = blk->bp->b_addr; max = be16_to_cpu(node->hdr.count); blk->hashval = be32_to_cpu(node->btree[max-1].hashval); @@ -1216,15 +1202,15 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; xfs_da_args_t *args; int before=0, error; - xfs_dabuf_t *bp; + struct xfs_buf *bp; /* * Set up environment. */ args = state->args; ASSERT(args != NULL); - old_info = old_blk->bp->data; - new_info = new_blk->bp->data; + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1261,12 +1247,11 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->back = cpu_to_be32(new_blk->blkno); } else { @@ -1283,18 +1268,17 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); tmp_info->back = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->forw = cpu_to_be32(new_blk->blkno); } - xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); - xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); return(0); } @@ -1302,12 +1286,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, * Compare two intermediate nodes for "order". */ STATIC int -xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +xfs_da_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) { xfs_da_intnode_t *node1, *node2; - node1 = node1_bp->data; - node2 = node2_bp->data; + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && @@ -1324,11 +1310,13 @@ xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) * Pick up the last hashvalue from an intermediate node. */ STATIC uint -xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +xfs_da_node_lasthash( + struct xfs_buf *bp, + int *count) { xfs_da_intnode_t *node; - node = bp->data; + node = bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (count) *count = be16_to_cpu(node->hdr.count); @@ -1346,7 +1334,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, { xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; xfs_da_args_t *args; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; /* @@ -1354,8 +1342,8 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, */ args = state->args; ASSERT(args != NULL); - save_info = save_blk->bp->data; - drop_info = drop_blk->bp->data; + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -1380,13 +1368,12 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); tmp_info->forw = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } else { trace_xfs_da_unlink_forward(args); @@ -1398,17 +1385,16 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); tmp_info->back = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } - xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); return(0); } @@ -1443,7 +1429,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { ASSERT(blk->bp != NULL); - node = blk->bp->data; + node = blk->bp->b_addr; ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { blk->index++; @@ -1471,7 +1457,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * (if it's dirty, trans won't actually let go) */ if (release) - xfs_da_brelse(args->trans, blk->bp); + xfs_trans_brelse(args->trans, blk->bp); /* * Read the next child block. @@ -1482,7 +1468,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, if (error) return(error); ASSERT(blk->bp != NULL); - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); @@ -1702,11 +1688,13 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, - xfs_dabuf_t **dead_bufp) +xfs_da_swap_lastblock( + xfs_da_args_t *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; xfs_fileoff_t lastoff; xfs_inode_t *ip; xfs_trans_t *tp; @@ -1744,9 +1732,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); - xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); - dead_info = dead_buf->data; + memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); + xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->b_addr; /* * Get values from the moved block. */ @@ -1767,7 +1755,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if ((sib_blkno = be32_to_cpu(dead_info->back))) { if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1777,10 +1765,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->forw = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->forw, sizeof(sib_info->forw))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } /* @@ -1789,7 +1776,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if ((sib_blkno = be32_to_cpu(dead_info->forw))) { if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1799,10 +1786,9 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->back = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->back, sizeof(sib_info->back))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } par_blkno = mp->m_dirleafblk; @@ -1813,7 +1799,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, for (;;) { if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC) || (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { @@ -1837,7 +1823,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, par_blkno = be32_to_cpu(par_node->btree[entno].before); if (level == dead_level + 1) break; - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; } /* @@ -1853,7 +1839,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, if (entno < be16_to_cpu(par_node->hdr.count)) break; par_blkno = be32_to_cpu(par_node->hdr.info.forw); - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", @@ -1863,7 +1849,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, } if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) goto done; - par_node = par_buf->data; + par_node = par_buf->b_addr; if (unlikely( be16_to_cpu(par_node->hdr.level) != level || par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { @@ -1878,20 +1864,18 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, * Update the parent entry pointing to the moved block. */ par_node->btree[entno].before = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, par_buf, + xfs_trans_log_buf(tp, par_buf, XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, sizeof(par_node->btree[entno].before))); - xfs_da_buf_done(par_buf); - xfs_da_buf_done(dead_buf); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; done: if (par_buf) - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); if (sib_buf) - xfs_da_brelse(tp, sib_buf); - xfs_da_brelse(tp, last_buf); + xfs_trans_brelse(tp, sib_buf); + xfs_trans_brelse(tp, last_buf); return error; } @@ -1899,8 +1883,10 @@ done: * Remove a btree block from a directory or attribute. */ int -xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf) +xfs_da_shrink_inode( + xfs_da_args_t *args, + xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf) { xfs_inode_t *dp; int done, error, w, count; @@ -1935,7 +1921,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, break; } } - xfs_da_binval(tp, dead_buf); + xfs_trans_binval(tp, dead_buf); return error; } @@ -2099,7 +2085,7 @@ xfs_da_get_buf( struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, + struct xfs_buf **bpp, int whichfork) { struct xfs_buf *bp; @@ -2128,7 +2114,7 @@ xfs_da_get_buf( goto out_free; } - *bpp = xfs_da_buf_make(bp); + *bpp = bp; out_free: if (mapp != &map) @@ -2146,7 +2132,7 @@ xfs_da_read_buf( struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, + struct xfs_buf **bpp, int whichfork) { struct xfs_buf *bp; @@ -2178,16 +2164,14 @@ xfs_da_read_buf( else xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - *bpp = xfs_da_buf_make(bp); - /* * This verification code will be moved to a CRC verification callback * function so just leave it here unchanged until then. */ { - xfs_dir2_data_hdr_t *hdr = (*bpp)->data; - xfs_dir2_free_t *free = (*bpp)->data; - xfs_da_blkinfo_t *info = (*bpp)->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_free_t *free = bp->b_addr; + xfs_da_blkinfo_t *info = bp->b_addr; uint magic, magic1; struct xfs_mount *mp = dp->i_mount; @@ -2207,11 +2191,11 @@ xfs_da_read_buf( XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", XFS_ERRLEVEL_LOW, mp, info); error = XFS_ERROR(EFSCORRUPTED); - xfs_da_brelse(trans, *bpp); + xfs_trans_brelse(trans, bp); goto out_free; } } - + *bpp = bp; out_free: if (mapp != &map) kmem_free(mapp); @@ -2259,7 +2243,6 @@ out_free: } kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ -kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ /* * Allocate a dir-state structure. @@ -2279,13 +2262,8 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) { int i; - for (i = 0; i < state->altpath.active; i++) { - if (state->altpath.blk[i].bp) { - if (state->altpath.blk[i].bp != state->path.blk[i].bp) - xfs_da_buf_done(state->altpath.blk[i].bp); - state->altpath.blk[i].bp = NULL; - } - } + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; state->altpath.active = 0; } @@ -2295,88 +2273,9 @@ xfs_da_state_kill_altpath(xfs_da_state_t *state) void xfs_da_state_free(xfs_da_state_t *state) { - int i; - xfs_da_state_kill_altpath(state); - for (i = 0; i < state->path.active; i++) { - if (state->path.blk[i].bp) - xfs_da_buf_done(state->path.blk[i].bp); - } - if (state->extravalid && state->extrablk.bp) - xfs_da_buf_done(state->extrablk.bp); #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ kmem_zone_free(xfs_da_state_zone, state); } - -/* - * Create a dabuf. - */ -/* ARGSUSED */ -STATIC xfs_dabuf_t * -xfs_da_buf_make(xfs_buf_t *bp) -{ - xfs_dabuf_t *dabuf; - - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); - dabuf->bbcount = bp->b_length; - dabuf->data = bp->b_addr; - dabuf->bp = bp; - return dabuf; -} - -/* - * Release a dabuf. - */ -void -xfs_da_buf_done(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); - kmem_zone_free(xfs_dabuf_zone, dabuf); -} - -/* - * Log transaction from a dabuf. - */ -void -xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) -{ - ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); - ASSERT(dabuf->data == dabuf->bp->b_addr); - xfs_trans_log_buf(tp, dabuf->bp, first, last); -} - -/* - * Release dabuf from a transaction. - * Have to free up the dabuf before the buffers are released, - * since the synchronization on the dabuf is really the lock on the buffer. - */ -void -xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); - xfs_trans_brelse(tp, dabuf->bp); - xfs_da_buf_done(dabuf); -} - -/* - * Invalidate dabuf from a transaction. - */ -void -xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->data && dabuf->bbcount && dabuf->bp); - xfs_da_buf_done(dabuf); - xfs_trans_binval(tp, dabuf->bp); -} - -/* - * Get the first daddr from a dabuf. - */ -xfs_daddr_t -xfs_da_blkno(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->data); - return XFS_BUF_ADDR(dabuf->bp); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 0b64c4a37af..9f37aa03eb3 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -132,20 +132,6 @@ typedef struct xfs_da_args { { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" } -/* - * Structure to describe buffer(s) for a block. - * This is needed in the directory version 2 format case, when - * multiple non-contiguous fsblocks might be needed to cover one - * logical directory block. - * If the buffer count is 1 then the data pointer points to the - * same place as the b_addr field for the buffer, else to kmem_alloced memory. - */ -typedef struct xfs_dabuf { - short bbcount; /* how large is data in bbs */ - void *data; /* pointer for buffers' data */ - struct xfs_buf *bp; /* actually nbuf of these */ -} xfs_dabuf_t; - /* * Storage for holding state during Btree searches and split/join ops. * @@ -154,7 +140,7 @@ typedef struct xfs_dabuf { * which is slightly more than enough. */ typedef struct xfs_da_state_blk { - xfs_dabuf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ xfs_dablk_t blkno; /* filesystem blkno of buffer */ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ int index; /* relevant index into block */ @@ -207,7 +193,7 @@ struct xfs_nameops { * Routines used for growing the Btree. */ int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork); int xfs_da_split(xfs_da_state_t *state); /* @@ -237,14 +223,14 @@ int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bp, int whichfork); + struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, int whichfork); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf); + struct xfs_buf *dead_buf); uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, @@ -254,15 +240,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); -void xfs_da_buf_done(xfs_dabuf_t *dabuf); -void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, - uint last); -void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); - extern struct kmem_zone *xfs_da_state_zone; -extern struct kmem_zone *xfs_dabuf_zone; extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 67a250c36d4..b26a50f9921 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -592,7 +592,7 @@ int xfs_dir2_shrink_inode( xfs_da_args_t *args, xfs_dir2_db_t db, - xfs_dabuf_t *bp) + struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ @@ -634,7 +634,7 @@ xfs_dir2_shrink_inode( /* * Invalidate the buffer from the transaction. */ - xfs_da_binval(tp, bp); + xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 586732f2d80..e93ca8f054f 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -37,10 +37,10 @@ /* * Local function prototypes. */ -static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, - int last); -static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); @@ -66,7 +66,7 @@ xfs_dir2_block_addname( xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ @@ -102,14 +102,14 @@ xfs_dir2_block_addname( return error; } ASSERT(bp != NULL); - hdr = bp->data; + hdr = bp->b_addr; /* * Check the magic number, corrupted if wrong. */ if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", XFS_ERRLEVEL_LOW, mp, hdr); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); } len = xfs_dir2_data_entsize(args->namelen); @@ -212,7 +212,7 @@ xfs_dir2_block_addname( * If this isn't a real add, we're done with the buffer. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); /* * If we don't have space for the new entry & leaf ... */ @@ -228,7 +228,6 @@ xfs_dir2_block_addname( * Then add the new entry in that format. */ error = xfs_dir2_block_to_leaf(args, bp); - xfs_da_buf_done(bp); if (error) return error; return xfs_dir2_leaf_addname(args); @@ -422,7 +421,6 @@ xfs_dir2_block_addname( xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -437,7 +435,7 @@ xfs_dir2_block_getdents( filldir_t filldir) { xfs_dir2_data_hdr_t *hdr; /* block header */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_dir2_data_unused_t *dup; /* block unused entry */ @@ -469,7 +467,7 @@ xfs_dir2_block_getdents( * We'll skip entries before this. */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Set up values for the loop. @@ -514,7 +512,7 @@ xfs_dir2_block_getdents( cook & 0x7fffffff, be64_to_cpu(dep->inumber), DT_UNKNOWN)) { *offset = cook & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } } @@ -525,7 +523,7 @@ xfs_dir2_block_getdents( */ *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return 0; } @@ -535,17 +533,17 @@ xfs_dir2_block_getdents( static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); blp = xfs_dir2_block_leaf_p(btp); - xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } @@ -555,13 +553,13 @@ xfs_dir2_block_log_leaf( static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_buf *bp) /* block buffer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); - xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -575,7 +573,7 @@ xfs_dir2_block_lookup( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -593,7 +591,7 @@ xfs_dir2_block_lookup( return error; dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -607,7 +605,7 @@ xfs_dir2_block_lookup( */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); } @@ -617,13 +615,13 @@ xfs_dir2_block_lookup( static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ - xfs_dabuf_t **bpp, /* returned block buffer */ + struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -647,7 +645,7 @@ xfs_dir2_block_lookup_int( return error; } ASSERT(bp != NULL); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -666,7 +664,7 @@ xfs_dir2_block_lookup_int( high = mid - 1; if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } } @@ -714,7 +712,7 @@ xfs_dir2_block_lookup_int( /* * No match, release the buffer and return ENOENT. */ - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -728,7 +726,7 @@ xfs_dir2_block_removename( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -753,7 +751,7 @@ xfs_dir2_block_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -790,10 +788,9 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - xfs_da_buf_done(bp); + if (size > XFS_IFORK_DSIZE(dp)) return 0; - } + /* * If it works, do the conversion. */ @@ -810,7 +807,7 @@ xfs_dir2_block_replace( { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -829,7 +826,7 @@ xfs_dir2_block_replace( } dp = args->dp; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -844,7 +841,6 @@ xfs_dir2_block_replace( dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } @@ -871,8 +867,8 @@ xfs_dir2_block_sort( int /* error */ xfs_dir2_leaf_to_block( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ - xfs_dabuf_t *dbp) /* data buffer */ + struct xfs_buf *lbp, /* leaf buffer */ + struct xfs_buf *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ xfs_dir2_data_hdr_t *hdr; /* block header */ @@ -898,7 +894,7 @@ xfs_dir2_leaf_to_block( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* @@ -914,11 +910,9 @@ xfs_dir2_leaf_to_block( if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) - goto out; - } else { - error = 0; - goto out; - } + return error; + } else + return 0; } /* * Read the data block if we don't already have it, give up if it fails. @@ -926,9 +920,9 @@ xfs_dir2_leaf_to_block( if (dbp == NULL && (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, XFS_DATA_FORK))) { - goto out; + return error; } - hdr = dbp->data; + hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); /* * Size of the "leaf" area in the block. @@ -944,10 +938,9 @@ xfs_dir2_leaf_to_block( * If it's not free or is too short we can't do it. */ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || - be16_to_cpu(dup->length) < size) { - error = 0; - goto out; - } + be16_to_cpu(dup->length) < size) + return 0; + /* * Start converting it to block form. */ @@ -989,25 +982,17 @@ xfs_dir2_leaf_to_block( * Pitch the old leaf block. */ error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); - lbp = NULL; - if (error) { - goto out; - } + if (error) + return error; + /* * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) { - error = 0; - goto out; - } + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); -out: - if (lbp) - xfs_da_buf_done(lbp); - if (dbp) - xfs_da_buf_done(dbp); - return error; } /* @@ -1020,7 +1005,7 @@ xfs_dir2_sf_to_block( xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ @@ -1088,7 +1073,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - hdr = bp->data; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* * Compute size of block "tail" area. @@ -1217,6 +1202,5 @@ xfs_dir2_sf_to_block( xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 2046988e9eb..44ffd4d6bc9 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -42,8 +42,8 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); */ void xfs_dir2_data_check( - xfs_inode_t *dp, /* incore inode pointer */ - xfs_dabuf_t *bp) /* data block's buffer */ + struct xfs_inode *dp, /* incore inode pointer */ + struct xfs_buf *bp) /* data block's buffer */ { xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ xfs_dir2_data_free_t *bf; /* bestfree table */ @@ -65,7 +65,7 @@ xfs_dir2_data_check( struct xfs_name name; mp = dp->i_mount; - hdr = bp->data; + hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); @@ -389,9 +389,9 @@ int /* error */ xfs_dir2_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ - xfs_dabuf_t **bpp) /* output block buffer */ + struct xfs_buf **bpp) /* output block buffer */ { - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ @@ -417,7 +417,7 @@ xfs_dir2_data_init( /* * Initialize the header. */ - hdr = bp->data; + hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { @@ -449,16 +449,16 @@ xfs_dir2_data_init( */ void xfs_dir2_data_log_entry( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -468,15 +468,15 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - xfs_da_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); } /* @@ -484,11 +484,11 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->data; + xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); @@ -496,13 +496,13 @@ xfs_dir2_data_log_unused( /* * Log the first part of the unused entry. */ - xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -514,8 +514,8 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ int *needlogp, /* out: log header */ @@ -531,7 +531,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ mp = tp->t_mountp; - hdr = bp->data; + hdr = bp->b_addr; /* * Figure out where the end of the data area is. @@ -696,8 +696,8 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* data block buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ xfs_dir2_data_aoff_t len, /* length to use */ @@ -713,7 +713,7 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - hdr = bp->data; + hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 397ffbcbab1..69accf6cbc4 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -38,15 +38,15 @@ * Local function declarations. */ #ifdef DEBUG -static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leaf_check(dp, bp) #endif -static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, - int *indexp, xfs_dabuf_t **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); /* @@ -55,7 +55,7 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); int /* error */ xfs_dir2_block_to_leaf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *dbp) /* input block's buffer */ + struct xfs_buf *dbp) /* input block's buffer */ { __be16 *bestsp; /* leaf's bestsp entries */ xfs_dablk_t blkno; /* leaf block's bno */ @@ -64,7 +64,7 @@ xfs_dir2_block_to_leaf( xfs_dir2_block_tail_t *btp; /* block's tail */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *lbp; /* leaf block's buffer */ + struct xfs_buf *lbp; /* leaf block's buffer */ xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ @@ -95,8 +95,8 @@ xfs_dir2_block_to_leaf( return error; } ASSERT(lbp != NULL); - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -143,7 +143,6 @@ xfs_dir2_block_to_leaf( xfs_dir2_leaf_check(dp, lbp); xfs_dir2_data_check(dp, dbp); xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); - xfs_da_buf_done(lbp); return 0; } @@ -282,7 +281,7 @@ xfs_dir2_leaf_addname( __be16 *bestsp; /* freespace table in leaf */ int compact; /* need to compact leaves */ xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry */ @@ -291,7 +290,7 @@ xfs_dir2_leaf_addname( int highstale; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - xfs_dabuf_t *lbp; /* leaf's buffer */ + struct xfs_buf *lbp; /* leaf's buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ @@ -328,7 +327,7 @@ xfs_dir2_leaf_addname( * But if there are dup hash values the index is of the first of those. */ index = xfs_dir2_leaf_search_hash(args, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); @@ -402,14 +401,13 @@ xfs_dir2_leaf_addname( */ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* * Convert to node form. */ error = xfs_dir2_leaf_to_node(args, lbp); - xfs_da_buf_done(lbp); if (error) return error; /* @@ -427,7 +425,7 @@ xfs_dir2_leaf_addname( * a new data block. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } /* @@ -435,7 +433,7 @@ xfs_dir2_leaf_addname( * changed anything. */ if (args->total == 0 && use_block == -1) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* @@ -466,14 +464,14 @@ xfs_dir2_leaf_addname( */ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &use_block))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* * Initialize the block. */ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* @@ -493,7 +491,7 @@ xfs_dir2_leaf_addname( */ else xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - hdr = dbp->data; + hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; } @@ -505,10 +503,10 @@ xfs_dir2_leaf_addname( if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp, XFS_DATA_FORK))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } - hdr = dbp->data; + hdr = dbp->b_addr; grown = 0; } xfs_dir2_data_check(dp, dbp); @@ -570,9 +568,7 @@ xfs_dir2_leaf_addname( xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); xfs_dir2_data_check(dp, dbp); - xfs_da_buf_done(dbp); return 0; } @@ -583,8 +579,8 @@ xfs_dir2_leaf_addname( */ STATIC void xfs_dir2_leaf_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf's buffer */ + struct xfs_inode *dp, /* incore directory inode */ + struct xfs_buf *bp) /* leaf's buffer */ { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -592,7 +588,7 @@ xfs_dir2_leaf_check( xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); /* @@ -628,14 +624,14 @@ xfs_dir2_leaf_check( void xfs_dir2_leaf_compact( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ - leaf = bp->data; + leaf = bp->b_addr; if (!leaf->hdr.stale) { return; } @@ -677,7 +673,7 @@ xfs_dir2_leaf_compact( */ void xfs_dir2_leaf_compact_x1( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -693,7 +689,7 @@ xfs_dir2_leaf_compact_x1( int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); index = *indexp; @@ -775,7 +771,7 @@ xfs_dir2_leaf_getdents( xfs_off_t *offset, filldir_t filldir) { - xfs_dabuf_t *bp; /* data block buffer */ + struct xfs_buf *bp; /* data block buffer */ int byteoff; /* offset in current block */ xfs_dir2_db_t curdb; /* db for current block */ xfs_dir2_off_t curoff; /* current overall offset */ @@ -839,13 +835,13 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { + if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { /* * If we have a buffer, we need to release it and * take it out of the mapping. */ if (bp) { - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); bp = NULL; map_blocks -= mp->m_dirblkfsbs; /* @@ -1035,7 +1031,7 @@ xfs_dir2_leaf_getdents( else if (curoff > newoff) ASSERT(xfs_dir2_byte_to_db(mp, curoff) == curdb); - hdr = bp->data; + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* * Find our position in the block. @@ -1119,7 +1115,7 @@ xfs_dir2_leaf_getdents( *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; kmem_free(map); if (bp) - xfs_da_brelse(NULL, bp); + xfs_trans_brelse(NULL, bp); return error; } @@ -1130,10 +1126,10 @@ int xfs_dir2_leaf_init( xfs_da_args_t *args, /* operation arguments */ xfs_dir2_db_t bno, /* directory block number */ - xfs_dabuf_t **bpp, /* out: leaf buffer */ + struct xfs_buf **bpp, /* out: leaf buffer */ int magic) /* magic number for block */ { - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1156,7 +1152,7 @@ xfs_dir2_leaf_init( return error; } ASSERT(bp != NULL); - leaf = bp->data; + leaf = bp->b_addr; /* * Initialize the header. */ @@ -1186,7 +1182,7 @@ xfs_dir2_leaf_init( static void xfs_dir2_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1195,12 +1191,12 @@ xfs_dir2_leaf_log_bests( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1210,7 +1206,7 @@ xfs_dir2_leaf_log_bests( void xfs_dir2_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { @@ -1218,12 +1214,12 @@ xfs_dir2_leaf_log_ents( xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); firstlep = &leaf->ents[first]; lastlep = &leaf->ents[last]; - xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1232,15 +1228,15 @@ xfs_dir2_leaf_log_ents( */ void xfs_dir2_leaf_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), (uint)(sizeof(leaf->hdr) - 1)); } @@ -1249,18 +1245,18 @@ xfs_dir2_leaf_log_header( */ STATIC void xfs_dir2_leaf_log_tail( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ xfs_mount_t *mp; /* filesystem mount point */ mp = tp->t_mountp; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); } @@ -1273,12 +1269,12 @@ int xfs_dir2_leaf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* found entry index */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1294,7 +1290,7 @@ xfs_dir2_leaf_lookup( tp = args->trans; dp = args->dp; xfs_dir2_leaf_check(dp, lbp); - leaf = lbp->data; + leaf = lbp->b_addr; /* * Get to the leaf entry and contained data entry address. */ @@ -1303,15 +1299,15 @@ xfs_dir2_leaf_lookup( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(error); } @@ -1324,17 +1320,17 @@ xfs_dir2_leaf_lookup( static int /* error */ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t **lbpp, /* out: leaf buffer */ + struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - xfs_dabuf_t **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp) /* out: data buffer */ { xfs_dir2_db_t curdb = -1; /* current data block number */ - xfs_dabuf_t *dbp = NULL; /* data buffer */ + struct xfs_buf *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index in leaf block */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1354,7 +1350,7 @@ xfs_dir2_leaf_lookup_int( if (error) return error; *lbpp = lbp; - leaf = lbp->data; + leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); /* * Look for the first leaf entry with our hash value. @@ -1382,12 +1378,12 @@ xfs_dir2_leaf_lookup_int( */ if (newdb != curdb) { if (dbp) - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &dbp, XFS_DATA_FORK); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } xfs_dir2_data_check(dp, dbp); @@ -1396,7 +1392,7 @@ xfs_dir2_leaf_lookup_int( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index @@ -1424,12 +1420,12 @@ xfs_dir2_leaf_lookup_int( if (args->cmpresult == XFS_CMP_CASE) { ASSERT(cidb != -1); if (cidb != curdb) { - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, cidb), -1, &dbp, XFS_DATA_FORK); if (error) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } } @@ -1441,8 +1437,8 @@ xfs_dir2_leaf_lookup_int( */ ASSERT(cidb == -1); if (dbp) - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOENT); } @@ -1456,13 +1452,13 @@ xfs_dir2_leaf_removename( __be16 *bestsp; /* leaf block best freespace */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry structure */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_db_t i; /* temporary data block # */ int index; /* index into leaf entries */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ @@ -1483,8 +1479,8 @@ xfs_dir2_leaf_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; - hdr = dbp->data; + leaf = lbp->b_addr; + hdr = dbp->b_addr; xfs_dir2_data_check(dp, dbp); /* * Point to the leaf entry, use that to point to the data entry. @@ -1541,12 +1537,9 @@ xfs_dir2_leaf_removename( * Just go on, returning success, leaving the * empty block in place. */ - if (error == ENOSPC && args->total == 0) { - xfs_da_buf_done(dbp); + if (error == ENOSPC && args->total == 0) error = 0; - } xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); return error; } dbp = NULL; @@ -1577,10 +1570,9 @@ xfs_dir2_leaf_removename( /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk && dbp != NULL) { - xfs_da_buf_done(dbp); + else if (db != mp->m_dirdatablk) dbp = NULL; - } + xfs_dir2_leaf_check(dp, lbp); /* * See if we can convert to block form. @@ -1595,12 +1587,12 @@ int /* error */ xfs_dir2_leaf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index of leaf entry */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ @@ -1614,7 +1606,7 @@ xfs_dir2_leaf_replace( return error; } dp = args->dp; - leaf = lbp->data; + leaf = lbp->b_addr; /* * Point to the leaf entry, get data address from it. */ @@ -1623,7 +1615,7 @@ xfs_dir2_leaf_replace( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + + ((char *)dbp->b_addr + xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* @@ -1632,9 +1624,8 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_da_buf_done(dbp); xfs_dir2_leaf_check(dp, lbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return 0; } @@ -1646,7 +1637,7 @@ xfs_dir2_leaf_replace( int /* index value */ xfs_dir2_leaf_search_hash( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_dahash_t hash=0; /* hash from this entry */ xfs_dahash_t hashwant; /* hash value looking for */ @@ -1656,7 +1647,7 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ - leaf = lbp->data; + leaf = lbp->b_addr; #ifndef __KERNEL__ if (!leaf->hdr.count) return 0; @@ -1699,11 +1690,11 @@ xfs_dir2_leaf_search_hash( int /* error */ xfs_dir2_leaf_trim_data( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ + struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { __be16 *bestsp; /* leaf bests table */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1722,12 +1713,12 @@ xfs_dir2_leaf_trim_data( return error; } - leaf = lbp->data; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); #ifdef DEBUG { - struct xfs_dir2_data_hdr *hdr = dbp->data; + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); ASSERT(be16_to_cpu(hdr->bestfree[0].length) == @@ -1741,7 +1732,7 @@ xfs_dir2_leaf_trim_data( */ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { ASSERT(error != ENOSPC); - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); return error; } /* @@ -1781,10 +1772,10 @@ xfs_dir2_node_to_leaf( xfs_da_args_t *args; /* operation arguments */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *fbp; /* buffer for freespace block */ + struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_dabuf_t *lbp; /* buffer for leaf block */ + struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ @@ -1838,7 +1829,7 @@ xfs_dir2_node_to_leaf( if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) return 0; lbp = state->path.blk[0].bp; - leaf = lbp->data; + leaf = lbp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Read the freespace block. @@ -1847,7 +1838,7 @@ xfs_dir2_node_to_leaf( XFS_DATA_FORK))) { return error; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); @@ -1857,7 +1848,7 @@ xfs_dir2_node_to_leaf( */ if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > mp->m_dirblksize) { - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); return 0; } diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index b0f26780449..6c705240660 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -36,20 +36,20 @@ /* * Function declarations. */ -static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); #ifdef DEBUG -static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_leafn_check(dp, bp) #endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, - int start_s, xfs_dabuf_t *bp_d, int start_d, - int count); +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, + int start_s, struct xfs_buf *bp_d, + int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); static int xfs_dir2_node_addname_int(xfs_da_args_t *args, @@ -60,16 +60,16 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args, */ STATIC void xfs_dir2_free_log_bests( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->bests[first] - (char *)free), (uint)((char *)&free->bests[last] - (char *)free + sizeof(free->bests[0]) - 1)); @@ -80,14 +80,14 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* freespace buffer */ + struct xfs_trans *tp, + struct xfs_buf *bp) { xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); } @@ -99,11 +99,11 @@ xfs_dir2_free_log_header( int /* error */ xfs_dir2_leaf_to_node( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to freespace entry */ @@ -136,8 +136,8 @@ xfs_dir2_leaf_to_node( return error; } ASSERT(fbp != NULL); - free = fbp->data; - leaf = lbp->data; + free = fbp->b_addr; + leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); /* * Initialize the freespace block header. @@ -164,7 +164,6 @@ xfs_dir2_leaf_to_node( xfs_dir2_leaf_log_header(tp, lbp); xfs_dir2_free_log_header(tp, fbp); xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_da_buf_done(fbp); xfs_dir2_leafn_check(dp, lbp); return 0; } @@ -175,7 +174,7 @@ xfs_dir2_leaf_to_node( */ static int /* error */ xfs_dir2_leafn_add( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int index) /* insertion pt for new entry */ { @@ -195,7 +194,7 @@ xfs_dir2_leafn_add( dp = args->dp; mp = dp->i_mount; tp = args->trans; - leaf = bp->data; + leaf = bp->b_addr; /* * Quick check just to make sure we are not going to index @@ -261,15 +260,15 @@ xfs_dir2_leafn_add( */ void xfs_dir2_leafn_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_inode *dp, + struct xfs_buf *bp) { int i; /* leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ int stale; /* count of stale leaves */ - leaf = bp->data; + leaf = bp->b_addr; mp = dp->i_mount; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); @@ -291,12 +290,12 @@ xfs_dir2_leafn_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { xfs_dir2_leaf_t *leaf; /* leaf structure */ - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) *count = be16_to_cpu(leaf->hdr.count); @@ -311,12 +310,12 @@ xfs_dir2_leafn_lasthash( */ STATIC int xfs_dir2_leafn_lookup_for_addname( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ @@ -335,7 +334,7 @@ xfs_dir2_leafn_lookup_for_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -352,7 +351,7 @@ xfs_dir2_leafn_lookup_for_addname( /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; - free = curbp->data; + free = curbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); @@ -394,7 +393,7 @@ xfs_dir2_leafn_lookup_for_addname( * If we had one before, drop it. */ if (curbp) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); /* * Read the free block. */ @@ -403,7 +402,7 @@ xfs_dir2_leafn_lookup_for_addname( -1, &curbp, XFS_DATA_FORK); if (error) return error; - free = curbp->data; + free = curbp->b_addr; ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); ASSERT((be32_to_cpu(free->hdr.firstdb) % @@ -424,7 +423,7 @@ xfs_dir2_leafn_lookup_for_addname( XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; @@ -459,12 +458,12 @@ out: */ STATIC int xfs_dir2_leafn_lookup_for_entry( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ xfs_dir2_db_t curdb = -1; /* current data block number */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ @@ -480,7 +479,7 @@ xfs_dir2_leafn_lookup_for_entry( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); #ifdef __KERNEL__ ASSERT(be16_to_cpu(leaf->hdr.count) > 0); @@ -525,7 +524,7 @@ xfs_dir2_leafn_lookup_for_entry( */ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || curdb != state->extrablk.blkno)) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); /* * If needing the block that is saved with a CI match, * use it otherwise read in the new data block. @@ -547,7 +546,7 @@ xfs_dir2_leafn_lookup_for_entry( /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + + dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return @@ -559,7 +558,7 @@ xfs_dir2_leafn_lookup_for_entry( /* If there is a CI match block, drop it */ if (args->cmpresult != XFS_CMP_DIFFERENT && curdb != state->extrablk.blkno) - xfs_da_brelse(tp, state->extrablk.bp); + xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); *indexp = index; @@ -567,7 +566,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.bp = curbp; state->extrablk.blkno = curdb; state->extrablk.index = (int)((char *)dep - - (char *)curbp->data); + (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); @@ -586,7 +585,7 @@ xfs_dir2_leafn_lookup_for_entry( } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) - xfs_da_brelse(tp, curbp); + xfs_trans_brelse(tp, curbp); } } else { state->extravalid = 0; @@ -602,7 +601,7 @@ xfs_dir2_leafn_lookup_for_entry( */ int xfs_dir2_leafn_lookup_int( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ @@ -620,9 +619,9 @@ xfs_dir2_leafn_lookup_int( static void xfs_dir2_leafn_moveents( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp_s, /* source leaf buffer */ + struct xfs_buf *bp_s, /* source leaf buffer */ int start_s, /* source leaf index */ - xfs_dabuf_t *bp_d, /* destination leaf buffer */ + struct xfs_buf *bp_d, /* destination leaf buffer */ int start_d, /* destination leaf index */ int count) /* count of leaves to copy */ { @@ -640,8 +639,8 @@ xfs_dir2_leafn_moveents( return; } tp = args->trans; - leaf_s = bp_s->data; - leaf_d = bp_d->data; + leaf_s = bp_s->b_addr; + leaf_d = bp_d->b_addr; /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination @@ -702,14 +701,14 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ - xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; + leaf1 = leaf1_bp->b_addr; + leaf2 = leaf2_bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (be16_to_cpu(leaf1->hdr.count) > 0 && @@ -757,8 +756,8 @@ xfs_dir2_leafn_rebalance( blk1 = blk2; blk2 = tmp; } - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); #ifdef DEBUG oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); @@ -834,14 +833,14 @@ xfs_dir2_leafn_rebalance( static int /* error */ xfs_dir2_leafn_remove( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int index, /* leaf entry index */ xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -858,7 +857,7 @@ xfs_dir2_leafn_remove( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* * Point to the entry we're removing. @@ -884,7 +883,7 @@ xfs_dir2_leafn_remove( * in the data block in case it changes. */ dbp = dblk->bp; - hdr = dbp->data; + hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); longest = be16_to_cpu(hdr->bestfree[0].length); needlog = needscan = 0; @@ -905,7 +904,7 @@ xfs_dir2_leafn_remove( */ if (longest < be16_to_cpu(hdr->bestfree[0].length)) { int error; /* error return value */ - xfs_dabuf_t *fbp; /* freeblock buffer */ + struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ @@ -920,7 +919,7 @@ xfs_dir2_leafn_remove( -1, &fbp, XFS_DATA_FORK))) { return error; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == xfs_dir2_free_max_bests(mp) * @@ -948,9 +947,7 @@ xfs_dir2_leafn_remove( * In this case just drop the buffer and some one else * will eventually get rid of the empty block. */ - else if (error == ENOSPC && args->total == 0) - xfs_da_buf_done(dbp); - else + else if (!(error == ENOSPC && args->total == 0)) return error; } /* @@ -1018,11 +1015,6 @@ xfs_dir2_leafn_remove( */ if (logfree) xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * Drop the buffer if we still have it. - */ - if (fbp) - xfs_da_buf_done(fbp); } xfs_dir2_leafn_check(dp, bp); /* @@ -1114,7 +1106,7 @@ xfs_dir2_leafn_toosmall( { xfs_da_state_blk_t *blk; /* leaf block */ xfs_dablk_t blkno; /* leaf block number */ - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ int bytes; /* bytes in use */ int count; /* leaf live entry count */ int error; /* error return value */ @@ -1130,7 +1122,7 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->data; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); @@ -1189,7 +1181,7 @@ xfs_dir2_leafn_toosmall( leaf = (xfs_dir2_leaf_t *)info; count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes = state->blocksize - (state->blocksize >> 2); - leaf = bp->data; + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); bytes -= count * (uint)sizeof(leaf->ents[0]); @@ -1198,7 +1190,7 @@ xfs_dir2_leafn_toosmall( */ if (bytes >= 0) break; - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); } /* * Didn't like either block, give up. @@ -1207,11 +1199,7 @@ xfs_dir2_leafn_toosmall( *action = 0; return 0; } - /* - * Done with the sibling leaf block here, drop the dabuf - * so path_shift can get it. - */ - xfs_da_buf_done(bp); + /* * Make altpath point to the block we want to keep (the lower * numbered block) and path point to the block we want to drop. @@ -1247,8 +1235,8 @@ xfs_dir2_leafn_unbalance( args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); /* @@ -1356,13 +1344,13 @@ xfs_dir2_node_addname_int( { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t dbno; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ int error; /* error return value */ xfs_dir2_db_t fbno; /* freespace block number */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ int findex; /* freespace entry index */ xfs_dir2_free_t *free=NULL; /* freespace block structure */ xfs_dir2_db_t ifbno; /* initial freespace block no */ @@ -1390,7 +1378,7 @@ xfs_dir2_node_addname_int( * Remember initial freespace block number. */ ifbno = fblk->blkno; - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; /* @@ -1474,7 +1462,7 @@ xfs_dir2_node_addname_int( if (unlikely(fbp == NULL)) { continue; } - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } @@ -1492,7 +1480,7 @@ xfs_dir2_node_addname_int( /* * Drop the block. */ - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); fbp = NULL; if (fblk && fblk->bp) fblk->bp = NULL; @@ -1507,36 +1495,23 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || - args->total == 0) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); - } + /* * Allocate and initialize the new data block. */ if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + (error = xfs_dir2_data_init(args, dbno, &dbp)))) return error; - } + /* * If (somehow) we have a freespace block, get rid of it. */ if (fbp) - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); if (fblk && fblk->bp) fblk->bp = NULL; @@ -1547,10 +1522,9 @@ xfs_dir2_node_addname_int( fbno = xfs_dir2_db_to_fdb(mp, dbno); if (unlikely(error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { - xfs_da_buf_done(dbp); + XFS_DATA_FORK))) return error; - } + /* * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. @@ -1598,7 +1572,7 @@ xfs_dir2_node_addname_int( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - free = fbp->data; + free = fbp->b_addr; free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); free->hdr.firstdb = cpu_to_be32( (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * @@ -1606,7 +1580,7 @@ xfs_dir2_node_addname_int( free->hdr.nvalid = 0; free->hdr.nused = 0; } else { - free = fbp->data; + free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); } @@ -1639,7 +1613,7 @@ xfs_dir2_node_addname_int( * We haven't allocated the data entry yet so this will * change again. */ - hdr = dbp->data; + hdr = dbp->b_addr; free->bests[findex] = hdr->bestfree[0].length; logfree = 1; } @@ -1650,22 +1624,17 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; - } + /* * Read the data block in. */ - if (unlikely( - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK))) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp, XFS_DATA_FORK); + if (error) return error; - } - hdr = dbp->data; + hdr = dbp->b_addr; logfree = 0; } ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); @@ -1713,17 +1682,11 @@ xfs_dir2_node_addname_int( */ if (logfree) xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * If the caller didn't hand us the freespace block, drop it. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); /* * Return the data block and offset in args, then drop the data block. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); - xfs_da_buf_done(dbp); return 0; } @@ -1761,22 +1724,23 @@ xfs_dir2_node_lookup( /* If a CI match, dup the actual name and return EEXIST */ xfs_dir2_data_entry_t *dep; - dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> - data + state->extrablk.index); + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); } /* * Release the btree blocks and leaf block. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } /* * Release the data block if we have it. */ if (state->extravalid && state->extrablk.bp) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } xfs_da_state_free(state); @@ -1893,13 +1857,13 @@ xfs_dir2_node_replace( */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->data; + leaf = blk->bp->b_addr; lep = &leaf->ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ - hdr = state->extrablk.bp->data; + hdr = state->extrablk.bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + @@ -1916,14 +1880,14 @@ xfs_dir2_node_replace( * Didn't find it, and we're holding a data block. Drop it. */ else if (state->extravalid) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } /* * Release all the buffers in the cursor. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } xfs_da_state_free(state); @@ -1940,7 +1904,7 @@ xfs_dir2_node_trim_free( xfs_fileoff_t fo, /* free block number */ int *rvalp) /* out: did something */ { - xfs_dabuf_t *bp; /* freespace buffer */ + struct xfs_buf *bp; /* freespace buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ @@ -1965,13 +1929,13 @@ xfs_dir2_node_trim_free( if (bp == NULL) { return 0; } - free = bp->data; + free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* * If there are used entries, there's nothing to do. */ if (be32_to_cpu(free->hdr.nused) > 0) { - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; } @@ -1987,7 +1951,7 @@ xfs_dir2_node_trim_free( * pieces. This is the last block of an extent. */ ASSERT(error != ENOSPC); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return error; } /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 067f403ecf8..3523d3e15aa 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -25,7 +25,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); @@ -37,11 +37,11 @@ extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); + struct xfs_buf *lbp, struct xfs_buf *dbp); /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); #else #define xfs_dir2_data_check(dp,bp) #endif @@ -51,43 +51,43 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, - struct xfs_dabuf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, - struct xfs_dabuf *dbp); + struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_dabuf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + struct xfs_buf *bp); +extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_dabuf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + struct xfs_buf **bpp, int magic); +extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); + struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, - struct xfs_dabuf *lbp); + struct xfs_buf *lbp); extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, - struct xfs_dabuf *lbp, xfs_dir2_db_t db); + struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, int lowstale, int highstale, @@ -96,13 +96,13 @@ extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); -extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); +extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); extern int xfs_dir2_leafn_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); @@ -122,7 +122,7 @@ extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); -extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 19bf0c5e38f..1b9fc3ec7e4 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -222,7 +222,7 @@ xfs_dir2_block_sfsize( int /* error */ xfs_dir2_block_to_sf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, int size, /* shortform directory size */ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { @@ -249,7 +249,7 @@ xfs_dir2_block_to_sf( * and add local data. */ hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->data, mp->m_dirblksize); + memcpy(hdr, bp->b_addr, mp->m_dirblksize); logflags = XFS_ILOG_CORE; if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { ASSERT(error != ENOSPC); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 425f6e9d4c0..07f70e17c74 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1491,13 +1491,9 @@ xfs_init_zones(void) if (!xfs_da_state_zone) goto out_destroy_btree_cur_zone; - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - if (!xfs_dabuf_zone) - goto out_destroy_da_state_zone; - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); if (!xfs_ifork_zone) - goto out_destroy_dabuf_zone; + goto out_destroy_da_state_zone; xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); if (!xfs_trans_zone) @@ -1560,8 +1556,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: kmem_zone_destroy(xfs_ifork_zone); - out_destroy_dabuf_zone: - kmem_zone_destroy(xfs_dabuf_zone); out_destroy_da_state_zone: kmem_zone_destroy(xfs_da_state_zone); out_destroy_btree_cur_zone: @@ -1589,7 +1583,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_dabuf_zone); kmem_zone_destroy(xfs_da_state_zone); kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); -- cgit v1.2.3 From 9b73bd7b61f320ffe7bda0126592ccf836d7ef90 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jun 2012 18:50:15 +1000 Subject: xfs: factor buffer reading from xfs_dir2_leaf_getdents The buffer reading code in xfs_dir2_leaf_getdents is complex and difficult to follow due to the readahead and all the context is carries. it is also badly indented and so difficult to read. Factor it out into a separate function to make it easier to understand and optimise in future patches. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 434 ++++++++++++++++++++++++++----------------------- 1 file changed, 232 insertions(+), 202 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 69accf6cbc4..0b296253bd0 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -759,6 +759,218 @@ xfs_dir2_leaf_compact_x1( *highstalep = highstale; } +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_inode *dp, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + int error = 0; + int length; + int i; + int j; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_da_read_buf(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, + &bp, XFS_DATA_FORK); + + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= mp->m_dirblkfsbs; + + /* + * Do we need more readahead? + */ + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, + map[mip->ra_index].br_startblock + + mip->ra_offset), + (int)BTOBB(mp->m_dirblksize)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_da_reada_buf(NULL, dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, + XFS_DATA_FORK); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, mp->m_dirblkfsbs, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + j += length; + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + +out: + *bpp = bp; + return error; +} + /* * Getdents (readdir) for leaf and node directories. * This reads the data blocks only, so is the same for both forms. @@ -771,30 +983,18 @@ xfs_dir2_leaf_getdents( xfs_off_t *offset, filldir_t filldir) { - struct xfs_buf *bp; /* data block buffer */ - int byteoff; /* offset in current block */ - xfs_dir2_db_t curdb; /* db for current block */ - xfs_dir2_off_t curoff; /* current overall offset */ + struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ - int i; /* temporary loop index */ - int j; /* temporary loop index */ int length; /* temporary length value */ - xfs_bmbt_irec_t *map; /* map vector for blocks */ - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ xfs_mount_t *mp; /* filesystem mount point */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ xfs_dir2_off_t newoff; /* new curoff after new blk */ - int nmap; /* mappings to ask xfs_bmapi */ char *ptr = NULL; /* pointer to current data */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ + struct xfs_dir2_leaf_map_info *map_info; /* * If the offset is at or past the largest allowed value, @@ -810,10 +1010,12 @@ xfs_dir2_leaf_getdents( * buffer size, the directory block size, and the filesystem * block size. */ - map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize); - map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); - map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; - bp = NULL; + length = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP); + map_info->map_size = length; /* * Inside the loop we keep the main offset value as a byte offset @@ -825,7 +1027,9 @@ xfs_dir2_leaf_getdents( * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff)); + map_info->map_off = xfs_dir2_db_to_da(mp, + xfs_dir2_byte_to_db(mp, curoff)); + /* * Loop over directory entries until we reach the end offset. * Get more blocks and readahead as necessary. @@ -836,190 +1040,16 @@ xfs_dir2_leaf_getdents( * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - if (bp) { - xfs_trans_brelse(NULL, bp); - bp = NULL; - map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = MIN((int)map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * - map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - ASSERT(ra_want >= 0); - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + ra_want > map_blocks && - map_off < - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - nmap = map_size - map_valid; - error = xfs_bmapi_read(dp, map_off, - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET) - map_off, - &map[map_valid], &nmap, 0); - /* - * Don't know if we should ignore this or - * try to return an error. - * The trouble with returning errors - * is that readdir will just stop without - * actually passing the error through. - */ - if (error) - break; /* XXX */ - /* - * If we got all the mappings we asked for, - * set the final map offset based on the - * last bmap value received. - * Otherwise, we've reached the end. - */ - if (nmap == map_size - map_valid) - map_off = - map[map_valid + nmap - 1].br_startoff + - map[map_valid + nmap - 1].br_blockcount; - else - map_off = - xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET); - /* - * Look for holes in the mapping, and - * eliminate them. Count up the valid blocks. - */ - for (i = map_valid; i < map_valid + nmap; ) { - if (map[i].br_startblock == - HOLESTARTBLOCK) { - nmap--; - length = map_valid + nmap - i; - if (length) - memmove(&map[i], - &map[i + 1], - sizeof(map[i]) * - length); - } else { - map_blocks += - map[i].br_blockcount; - i++; - } - } - map_valid += nmap; - } - /* - * No valid mappings, so no more data blocks. - */ - if (!map_valid) { - curoff = xfs_dir2_da_to_byte(mp, map_off); + error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) break; - } - /* - * Read the directory block starting at the first - * mapping. - */ - curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : - -1, - &bp, XFS_DATA_FORK); - /* - * Should just skip over the data block instead - * of giving up. - */ - if (error) - break; /* XXX */ - /* - * Adjust the current amount of read-ahead: we just - * read a block that was previously ra. - */ - if (ra_current) - ra_current -= mp->m_dirblkfsbs; - /* - * Do we need more readahead? - */ - for (ra_index = ra_offset = i = 0; - ra_want > ra_current && i < map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(ra_index < map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > ra_current && - map[ra_index].br_blockcount >= - mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, - map[ra_index].br_startblock + - ra_offset), - (int)BTOBB(mp->m_dirblksize)); - ra_current = i; - } - /* - * Read-ahead a non-contiguous directory block. - * This doesn't use our mapping, but this - * is a very rare case. - */ - else if (i > ra_current) { - (void)xfs_da_reada_buf(NULL, dp, - map[ra_index].br_startoff + - ra_offset, XFS_DATA_FORK); - ra_current = i; - } - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not - * more than a dir block. - */ - length = MIN(mp->m_dirblkfsbs, - (int)(map[ra_index].br_blockcount - - ra_offset)); - j += length; - ra_offset += length; - /* - * Advance to the next mapping if - * this one is used up. - */ - if (ra_offset == - map[ra_index].br_blockcount) { - ra_offset = 0; - ra_index++; - } - } - } + /* * Having done a read, we need to set a new offset. */ - newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0); + newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); /* * Start of the current block. */ @@ -1030,7 +1060,7 @@ xfs_dir2_leaf_getdents( */ else if (curoff > newoff) ASSERT(xfs_dir2_byte_to_db(mp, curoff) == - curdb); + map_info->curdb); hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); /* @@ -1113,7 +1143,7 @@ xfs_dir2_leaf_getdents( *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - kmem_free(map); + kmem_free(map_info); if (bp) xfs_trans_brelse(NULL, bp); return error; -- cgit v1.2.3 From 1f432a887e9a5a5c25be6ac72b5da13652c8bed3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 Jul 2012 07:40:42 +1000 Subject: xfs: really fix the cursor leak in xfs_alloc_ag_vextent_near The current cursor is reallocated when retrying the allocation, so the existing cursor needs to be destroyed in both the restart and the failure cases. Signed-off-by: Dave Chinner Tested-by: Mike Snitzer Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 9d1aeb7e273..f654f51b0c6 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1074,13 +1074,13 @@ restart: * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + if (!forced++) { trace_xfs_alloc_near_busy(args); xfs_log_force(args->mp, XFS_LOG_SYNC); goto restart; } - - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; -- cgit v1.2.3 From eb71a12e411fe065f8663e12a8d81d561f9502ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 12 Jul 2012 07:40:43 +1000 Subject: xfs: don't defer metadata allocation to the workqueue Almost all metadata allocations come from shallow stack usage situations. Avoid the overhead of switching the allocation to a workqueue as we are not in danger of running out of stack when making these allocations. Metadata allocations are already marked through the args that are passed down, so this is trivial to do. Signed-off-by: Dave Chinner Reported-by: Mel Gorman Tested-by: Mel Gorman Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index f654f51b0c6..4f33c32affe 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2434,13 +2434,22 @@ xfs_alloc_vextent_worker( current_restore_flags_nested(&pflags, PF_FSTRANS); } - -int /* error */ +/* + * Data allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Metadata + * requests, OTOH, are generally from low stack usage paths, so avoid the + * context switch overhead here. + */ +int xfs_alloc_vextent( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args) { DECLARE_COMPLETION_ONSTACK(done); + if (!args->userdata) + return __xfs_alloc_vextent(args); + + args->done = &done; INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); queue_work(xfs_alloc_wq, &args->work); -- cgit v1.2.3 From 08023d6dbe840dc4271805a9ea376fcbdee9f744 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Jul 2012 06:00:04 -0400 Subject: xfs: prevent recursion in xfs_buf_iorequest If the b_iodone handler is run in calling context in xfs_buf_iorequest we can run into a recursion where xfs_buf_iodone_callbacks keeps calling back into xfs_buf_iorequest because an I/O error happened, which keeps calling back into xfs_buf_iorequest. This chain will usually not take long because the filesystem gets shut down because of log I/O errors, but even over a short time it can cause stack overflows if run on the same context. As a short term workaround make sure we always call the iodone handler in workqueue context. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 39c5d7622de..d1edfa1a811 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1355,7 +1355,7 @@ xfs_buf_iorequest( */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 0); + _xfs_buf_ioend(bp, 1); xfs_buf_rele(bp); } -- cgit v1.2.3 From a2dcf5df5f3813a44423d4a5026666e751ec00dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jul 2012 02:24:10 -0400 Subject: xfs: do not call xfs_bdstrat_cb in xfs_buf_iodone_callbacks xfs_bdstrat_cb only adds a check for a shutdown filesystem over xfs_buf_iorequest, but xfs_buf_iodone_callbacks just checked for a shut down filesystem a little earlier. In addition the shutdown handling in xfs_bdstrat_cb is not very suitable for this caller. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 51 ++++++++++++++++++++++----------------------------- fs/xfs/xfs_buf.h | 1 - fs/xfs/xfs_buf_item.c | 2 +- 3 files changed, 23 insertions(+), 31 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d1edfa1a811..d7a9dd735e1 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1049,27 +1049,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } -int -xfs_bwrite( - struct xfs_buf *bp) -{ - int error; - - ASSERT(xfs_buf_islocked(bp)); - - bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - - xfs_bdstrat_cb(bp); - - error = xfs_buf_iowait(bp); - if (error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); - } - return error; -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1139,14 +1118,7 @@ xfs_bioerror_relse( return EIO; } - -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int +STATIC int xfs_bdstrat_cb( struct xfs_buf *bp) { @@ -1167,6 +1139,27 @@ xfs_bdstrat_cb( return 0; } +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + /* * Wrapper around bdstrat so that we can stop data from going to disk in case * we are shutting down the filesystem. Typically user data goes thru this diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index aa96bd410ae..d03b73b9604 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -250,7 +250,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e4a6e4b6fa0..a8d0ed91119 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1101,7 +1101,7 @@ xfs_buf_iodone_callbacks( if (!XFS_BUF_ISSTALE(bp)) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; - xfs_bdstrat_cb(bp); + xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); } -- cgit v1.2.3 From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 17:13:09 -0400 Subject: stop passing nameidata to ->lookup() Just the flags; only NFS cares even about that, but there are legitimate uses for such argument. And getting rid of that completely would require splitting ->lookup() into a couple of methods (at least), so let's leave that alone for now... Signed-off-by: Al Viro --- fs/xfs/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a25fd80279..b41cfba14fa 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -197,7 +197,7 @@ STATIC struct dentry * xfs_vn_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *cip; struct xfs_name name; @@ -222,7 +222,7 @@ STATIC struct dentry * xfs_vn_ci_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *ip; struct xfs_name xname; -- cgit v1.2.3 From ebfc3b49a7ac25920cb5be5445f602e51d2ea559 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 18:05:36 -0400 Subject: don't pass nameidata to ->create() boolean "does it have to be exclusive?" flag is passed instead; Local filesystem should just ignore it - the object is guaranteed not to be there yet. Signed-off-by: Al Viro --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b41cfba14fa..9c4340f5c3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -179,7 +179,7 @@ xfs_vn_create( struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool flags) { return xfs_vn_mknod(dir, dentry, mode, 0); } -- cgit v1.2.3 From 96ee34be7a0965a117080a28b2c0cc6d4ee6adb1 Mon Sep 17 00:00:00 2001 From: Chen Baozi Date: Sat, 14 Jul 2012 03:38:13 +0800 Subject: xfs: fix comment typo of struct xfs_da_blkinfo. Fix trivial typo error that has written "It" to "Is". Signed-off-by: Chen Baozi Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_da_btree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 9f37aa03eb3..132adafb041 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -32,7 +32,7 @@ struct zone; /* * This structure is common to both leaf nodes and non-leaf nodes in the Btree. * - * Is is used to manage a doubly linked list of all blocks at the same + * It is used to manage a doubly linked list of all blocks at the same * level in the Btree, and to identify which type of block this is. */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ -- cgit v1.2.3 From 69ff2826117f1cde9a2491be57a578212bca551e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jun 2012 17:01:28 -0400 Subject: xfs: implement ->update_time Use this new method to replace our hacky use of ->dirty_inode. An additional benefit is that we can now propagate errors up the stack. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_iops.c | 45 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_super.c | 56 ------------------------------------------------------ fs/xfs/xfs_trace.h | 2 +- 3 files changed, 46 insertions(+), 57 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a25fd80279..5439c3f9945 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -897,6 +897,47 @@ xfs_vn_setattr( return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); } +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return -xfs_trans_commit(tp, 0); +} + #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -991,6 +1032,7 @@ static const struct inode_operations xfs_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -1016,6 +1058,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1041,6 +1084,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; static const struct inode_operations xfs_symlink_inode_operations = { @@ -1054,6 +1098,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, }; STATIC void diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 07f70e17c74..cb2deb13b06 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -868,61 +868,6 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -/* - * This is called by the VFS when dirtying inode metadata. This can happen - * for a few reasons, but we only care about timestamp updates, given that - * we handled the rest ourselves. In theory no other calls should happen, - * but for example generic_write_end() keeps dirtying the inode after - * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC, - * and skip this call otherwise. - * - * We'll hopefull get a different method just for updating timestamps soon, - * at which point this hack can go away, and maybe we'll also get real - * error handling here. - */ -STATIC void -xfs_fs_dirty_inode( - struct inode *inode, - int flags) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - if (flags != I_DIRTY_SYNC) - return; - - trace_xfs_dirty_inode(ip); - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto trouble; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* - * Grab all the latest timestamps from the Linux inode. - */ - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; - ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - error = xfs_trans_commit(tp, 0); - if (error) - goto trouble; - return; - -trouble: - xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino); -} - STATIC void xfs_fs_evict_inode( struct inode *inode) @@ -1436,7 +1381,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .dirty_inode = xfs_fs_dirty_inode, .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index caf5dabfd55..e5795dd6013 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -578,8 +578,8 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_dirty_inode); DEFINE_INODE_EVENT(xfs_evict_inode); +DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); -- cgit v1.2.3 From 6b7a03f03a2f8b1629133e35729eba4727fae3cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jul 2012 12:20:00 -0400 Subject: xfs: handle EOF correctly in xfs_vm_writepage We need to zero out part of a page which beyond EOF before setting uptodate, otherwise, mapread or write will see non-zero data beyond EOF. Based on the code in fs/buffer.c and the following ext4 commit: ext4: handle EOF correctly in ext4_bio_write_page() And yes, I wish we had a good test case for it. Signed-off-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 84e372596d5..91d77ac51bb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -927,11 +927,26 @@ xfs_vm_writepage( end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; if (page->index >= end_index) { - if ((page->index >= end_index + 1) || - !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + + /* + * Just skip the page if it is fully outside i_size, e.g. due + * to a truncate operation that is in progress. + */ + if (page->index >= end_index + 1 || offset_into_page == 0) { unlock_page(page); return 0; } + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); } end_offset = min_t(unsigned long long, -- cgit v1.2.3 From 475ee413f34165f8e6fcd7fbff3a4da1dce48c99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jul 2012 12:21:22 -0400 Subject: xfs: merge xfs_itobp into xfs_imap_to_bp All callers of xfs_imap_to_bp want the dinode pointer, so let's calculate it inside xfs_imap_to_bp. Once that is done xfs_itobp becomes a fairly pointless wrapper which can be replaced with direct calls to xfs_imap_to_bp. Signed-off-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 131 +++++++++++++++++------------------------------ fs/xfs/xfs_inode.h | 6 +-- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_sync.c | 4 +- 5 files changed, 54 insertions(+), 91 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 257f3c463e0..34c985de5fa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,23 +132,28 @@ xfs_inobp_check( #endif /* - * Find the buffer associated with the given inode map - * We do basic validation checks on the buffer once it has been - * retrieved from disk. + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. */ -STATIC int +int xfs_imap_to_bp( - xfs_mount_t *mp, - xfs_trans_t *tp, - struct xfs_imap *imap, - xfs_buf_t **bpp, - uint buf_flags, - uint iget_flags) + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) { - int error; - int i; - int ni; - xfs_buf_t *bp; + struct xfs_buf *bp; + int error; + int i; + int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, @@ -189,8 +194,8 @@ xfs_imap_to_bp( xfs_trans_brelse(tp, bp); return XFS_ERROR(EINVAL); } - XFS_CORRUPTION_ERROR("xfs_imap_to_bp", - XFS_ERRLEVEL_HIGH, mp, dip); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); #ifdef DEBUG xfs_emerg(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -204,7 +209,9 @@ xfs_imap_to_bp( } xfs_inobp_check(mp, bp); + *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; } @@ -240,63 +247,15 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, dipp, &bp, 0, imap_flags); if (error) return error; - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; *offset = imap.im_boffset; return 0; } - -/* - * This routine is called to map an inode to the buffer containing - * the on-disk version of the inode. It returns a pointer to the - * buffer containing the on-disk inode in the bpp parameter, and in - * the dip parameter it returns a pointer to the on-disk inode within - * that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * The inode is expected to already been mapped to its buffer and read - * in once, thus we can use the mapping information stored in the inode - * rather than calling xfs_imap(). This allows us to avoid the overhead - * of looking at the inode btree for small block file systems - * (see xfs_imap()). - */ -int -xfs_itobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - uint buf_flags) -{ - xfs_buf_t *bp; - int error; - - ASSERT(ip->i_imap.im_blkno != 0); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); - if (error) - return error; - - if (!bp) { - ASSERT(buf_flags & XBF_TRYLOCK); - ASSERT(tp == NULL); - *bpp = NULL; - return EAGAIN; - } - - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); - *bpp = bp; - return 0; -} - /* * Move inode type and inode format specific information from the * on-disk inode to the in-core inode. For fifos, devs, and sockets @@ -796,10 +755,9 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); if (error) return error; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); /* * If we got something that isn't an inode it means someone @@ -876,7 +834,7 @@ xfs_iread( /* * Use xfs_trans_brelse() to release the buffer containing the * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_itobp() above. If tp is NULL, this is just a normal + * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, @@ -1355,7 +1313,8 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -1429,16 +1388,16 @@ xfs_iunlink_remove( if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { /* - * We're at the head of the list. Get the inode's - * on-disk buffer to see if there is anyone after us - * on the list. Only modify our next pointer if it - * is not already NULLAGINO. This saves us the overhead - * of dealing with the buffer when there is no need to - * change it. + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp() returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } @@ -1493,13 +1452,15 @@ xfs_iunlink_remove( ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); } + /* - * Now last_ibp points to the buffer previous to us on - * the unlinked list. Pull us from the list. + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", __func__, error); return error; } @@ -1749,7 +1710,8 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) return error; @@ -2428,7 +2390,7 @@ xfs_iflush( /* * For stale inodes we cannot rely on the backing buffer remaining * stale in cache for the remaining life of the stale inode and so - * xfs_itobp() below may give us a buffer that no longer contains + * xfs_imap_to_bp() below may give us a buffer that no longer contains * inodes below. We have to check this after ensuring the inode is * unpinned so that it is safe to reclaim the stale inode after the * flush call. @@ -2454,7 +2416,8 @@ xfs_iflush( /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); if (error || !bp) { xfs_ifunlock(ip); return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1efff36a75b..942fd7f9110 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -560,9 +560,9 @@ do { \ int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, struct xfs_dinode **, struct xfs_buf **, int *, uint); -int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, struct xfs_dinode **, - struct xfs_buf **, uint); +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_dinode_to_disk(struct xfs_dinode *, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index eff577a9b67..01d10a66e30 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -555,7 +555,7 @@ xfs_bulkstat_single( /* * note that requesting valid inode numbers which are not allocated - * to inodes will most likely cause xfs_itobp to generate warning + * to inodes will most likely cause xfs_imap_to_bp to generate warning * messages about bad magic numbers. This is ok. The fact that * the inode isn't actually an inode is handled by the * error check below. Done this way to make the usual case faster diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a76ba886e73..5da3ace352b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3106,7 +3106,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 1e9ee064dbb..e61fc151907 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -712,8 +712,8 @@ restart: * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer would result - * in an ABBA deadlock with xfs_ifree_cluster(). + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want -- cgit v1.2.3 From 129dbc9a2d93bab823e57fe47f53d098a0d350f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jul 2012 12:21:51 -0400 Subject: xfs: remove xfs_inotobp There is no need to keep this helper around, opencoding it in the only caller is just as clear. Signed-off-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 70 +++++++++++++++--------------------------------------- fs/xfs/xfs_inode.h | 3 --- 2 files changed, 19 insertions(+), 54 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 34c985de5fa..d48e406de07 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -215,47 +215,6 @@ xfs_imap_to_bp( return 0; } -/* - * This routine is called to map an inode number within a file - * system to the buffer containing the on-disk version of the - * inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dip parameter - * it returns a pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * Use xfs_imap() to determine the size and location of the - * buffer to read from disk. - */ -int -xfs_inotobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - int *offset, - uint imap_flags) -{ - struct xfs_imap imap; - xfs_buf_t *bp; - int error; - - imap.im_blkno = 0; - error = xfs_imap(mp, tp, ino, &imap, imap_flags); - if (error) - return error; - - error = xfs_imap_to_bp(mp, tp, &imap, dipp, &bp, 0, imap_flags); - if (error) - return error; - - *bpp = bp; - *offset = imap.im_boffset; - return 0; -} - /* * Move inode type and inode format specific information from the * on-disk inode to the in-core inode. For fifos, devs, and sockets @@ -1431,23 +1390,32 @@ xfs_iunlink_remove( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); last_ibp = NULL; while (next_agino != agino) { - /* - * If the last inode wasn't the one pointing to - * us, then release its buffer since we're not - * going to do anything with it. - */ - if (last_ibp != NULL) { + struct xfs_imap imap; + + if (last_ibp) xfs_trans_brelse(tp, last_ibp); - } + + imap.im_blkno = 0; next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - error = xfs_inotobp(mp, tp, next_ino, &last_dip, - &last_ibp, &last_offset, 0); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); if (error) { xfs_warn(mp, - "%s: xfs_inotobp() returned error %d.", + "%s: xfs_imap_to_bp returned error %d.", __func__, error); return error; } + + last_offset = imap.im_boffset; next_agino = be32_to_cpu(last_dip->di_next_unlinked); ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 942fd7f9110..c2e2da3abae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -557,9 +557,6 @@ do { \ #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 -int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, - xfs_ino_t, struct xfs_dinode **, - struct xfs_buf **, int *, uint); int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_imap *, struct xfs_dinode **, struct xfs_buf **, uint, uint); -- cgit v1.2.3 From 0d882a360b9012bc7a7e921c935774c3fba1bfd9 Mon Sep 17 00:00:00 2001 From: Alain Renaud Date: Tue, 22 May 2012 15:56:21 -0500 Subject: Prefix IO_XX flags with XFS_IO_XX to avoid namespace colision. Add a XFS_ prefix to IO_DIRECT,XFS_IO_DELALLOC, XFS_IO_UNWRITTEN and XFS_IO_OVERWRITE. This to avoid namespace conflict with other modules. Signed-off-by: Alain Renaud Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 48 ++++++++++++++++++++++++------------------------ fs/xfs/xfs_aops.h | 14 +++++++------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 91d77ac51bb..15052ff916e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -179,7 +179,7 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -210,7 +210,7 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IO_UNWRITTEN) { + if (ioend->io_type == XFS_IO_UNWRITTEN) { /* * For buffered I/O we never preallocate a transaction when * doing the unwritten extent conversion, but for direct I/O @@ -312,7 +312,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (type == IO_UNWRITTEN) + if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { @@ -336,7 +336,7 @@ xfs_map_blocks( if (error) return -XFS_ERROR(error); - if (type == IO_DELALLOC && + if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { error = xfs_iomap_write_allocate(ip, offset, count, imap); if (!error) @@ -345,7 +345,7 @@ xfs_map_blocks( } #ifdef DEBUG - if (type == IO_UNWRITTEN) { + if (type == XFS_IO_UNWRITTEN) { ASSERT(nimaps); ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); @@ -634,11 +634,11 @@ xfs_check_page_type( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable += (type == IO_UNWRITTEN); + acceptable += (type == XFS_IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable += (type == IO_DELALLOC); + acceptable += (type == XFS_IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == IO_OVERWRITE); + acceptable += (type == XFS_IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -721,11 +721,11 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh) || buffer_mapped(bh)) { if (buffer_unwritten(bh)) - type = IO_UNWRITTEN; + type = XFS_IO_UNWRITTEN; else if (buffer_delay(bh)) - type = IO_DELALLOC; + type = XFS_IO_DELALLOC; else - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; @@ -733,7 +733,7 @@ xfs_convert_page( } lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -831,7 +831,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -956,7 +956,7 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = IO_OVERWRITE; + type = XFS_IO_OVERWRITE; if (wbc->sync_mode == WB_SYNC_NONE) nonblocking = 1; @@ -981,18 +981,18 @@ xfs_vm_writepage( } if (buffer_unwritten(bh)) { - if (type != IO_UNWRITTEN) { - type = IO_UNWRITTEN; + if (type != XFS_IO_UNWRITTEN) { + type = XFS_IO_UNWRITTEN; imap_valid = 0; } } else if (buffer_delay(bh)) { - if (type != IO_DELALLOC) { - type = IO_DELALLOC; + if (type != XFS_IO_DELALLOC) { + type = XFS_IO_DELALLOC; imap_valid = 0; } } else if (buffer_uptodate(bh)) { - if (type != IO_OVERWRITE) { - type = IO_OVERWRITE; + if (type != XFS_IO_OVERWRITE) { + type = XFS_IO_OVERWRITE; imap_valid = 0; } } else { @@ -1028,7 +1028,7 @@ xfs_vm_writepage( } if (imap_valid) { lock_buffer(bh); - if (type != IO_OVERWRITE) + if (type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &imap, offset); xfs_add_to_ioend(inode, bh, offset, type, &ioend, new_ioend); @@ -1069,7 +1069,7 @@ xfs_vm_writepage( * Reserve log space if we might write beyond the on-disk * inode size. */ - if (ioend->io_type != IO_UNWRITTEN && + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) { err = xfs_setfilesize_trans_alloc(ioend); if (err) @@ -1366,7 +1366,7 @@ xfs_end_io_direct_write( ioend->io_iocb = iocb; ioend->io_result = ret; if (private && size > 0) - ioend->io_type = IO_UNWRITTEN; + ioend->io_type = XFS_IO_UNWRITTEN; if (is_async) { ioend->io_isasync = 1; @@ -1398,7 +1398,7 @@ xfs_vm_direct_IO( * and converts at least on unwritten extent we will cancel * the still clean transaction after the I/O has finished. */ - iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT); + iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); if (offset + size > XFS_I(inode)->i_d.di_size) { ret = xfs_setfilesize_trans_alloc(ioend); if (ret) diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 84eafbcb0d9..c325abb8d61 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,17 +24,17 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - IO_DIRECT = 0, /* special case for direct I/O ioends */ - IO_DELALLOC, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_OVERWRITE, /* mapping covers already allocated extent */ + XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */ + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ { 0, "" }, \ - { IO_DELALLOC, "delalloc" }, \ - { IO_UNWRITTEN, "unwritten" }, \ - { IO_OVERWRITE, "overwrite" } + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } /* * xfs_ioend struct manages large extent writes for XFS. -- cgit v1.2.3 From 824c313139c2ce678011bf11c4823a0c99651c1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:45 -0400 Subject: xfs: remove xfs_ialloc_find_free This function is entirely trivial and only has one caller, so remove it to simplify the code. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 177a21a7ac4..30b816d1f7e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -609,13 +609,6 @@ xfs_ialloc_get_rec( /* * Visible inode allocation functions. */ -/* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} /* * Allocate an inode on disk. @@ -995,7 +988,7 @@ newino: } alloc_inode: - offset = xfs_ialloc_find_free(&rec.ir_free); + offset = xfs_lowbit64(rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % -- cgit v1.2.3 From 765927b2d508712d320c8934db963bbe14c3fcec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jun 2012 21:58:53 +0400 Subject: switch dentry_open() to struct path, make it grab references itself Signed-off-by: Al Viro --- fs/xfs/xfs_ioctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3a05a41b5d7..1f1535d25a9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -208,6 +208,7 @@ xfs_open_by_handle( struct inode *inode; struct dentry *dentry; fmode_t fmode; + struct path path; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -252,8 +253,10 @@ xfs_open_by_handle( goto out_dput; } - filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), - hreq->oflags, cred); + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); if (IS_ERR(filp)) { put_unused_fd(fd); return PTR_ERR(filp); -- cgit v1.2.3 From f2ecc5e453134a13c3b2b0f2cac52ab2d5c540d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:46 -0400 Subject: xfs: split xfs_dialloc Move the actual allocation once we have selected an allocation group into a separate helper, and make xfs_dialloc a wrapper around it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 349 ++++++++++++++++++++++++++-------------------------- 1 file changed, 174 insertions(+), 175 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 30b816d1f7e..a124b9f88aa 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -607,188 +607,35 @@ xfs_ialloc_get_rec( } /* - * Visible inode allocation functions. - */ - -/* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. + * Allocate an inode. * - * The arguments IO_agbp and alloc_done are defined to work within - * the constraint of one allocation per transaction. - * xfs_dialloc() is designed to be called twice if it has to do an - * allocation to make more free inodes. On the first call, - * IO_agbp should be set to NULL. If an inode is available, - * i.e., xfs_dialloc() did not need to do an allocation, an inode - * number is returned. In this case, IO_agbp would be set to the - * current ag_buf and alloc_done set to false. - * If an allocation needed to be done, xfs_dialloc would return - * the current ag_buf in IO_agbp and set alloc_done to true. - * The caller should then commit the current transaction, allocate a new - * transaction, and call xfs_dialloc() again, passing in the previous - * value of IO_agbp. IO_agbp should be held across the transactions. - * Since the agbp is locked across the two calls, the second call is - * guaranteed to have a free inode available. - * - * Once we successfully pick an inode its number is returned and the - * on-disk data structures are updated. The inode itself is not read - * in, since doing so would break ordering constraints with xfs_reclaim. + * The caller selected an AG for us, and made sure that free inodes are + * available. */ -int -xfs_dialloc( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ - xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ - boolean_t *alloc_done, /* true if we needed to replenish - inode freelist */ - xfs_ino_t *inop) /* inode number allocated */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) { - xfs_agnumber_t agcount; /* number of allocation groups */ - xfs_buf_t *agbp; /* allocation group header's buffer */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header structure */ - xfs_btree_cur_t *cur; /* inode allocation btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ialloced; /* inode allocation status */ - int noroom = 0; /* no space for inode blk allocation */ - xfs_ino_t ino; /* fs-relative inode to be returned */ - /* REFERENCED */ - int j; /* result code */ - xfs_mount_t *mp; /* file system mount structure */ - int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's AG relative inode # */ - xfs_agnumber_t pagno; /* parent's AG number */ - xfs_inobt_rec_incore_t rec; /* inode allocation record */ - xfs_agnumber_t tagno; /* testing allocation group number */ - xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ - struct xfs_perag *pag; - - - if (*IO_agbp == NULL) { - /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. - */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { - *inop = NULLFSINO; - return 0; - } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } else { - /* - * Continue where we left off before. In this case, we - * know that the allocation group has free inodes. - */ - agbp = *IO_agbp; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - } - mp = tp->t_mountp; - agcount = mp->m_sb.sb_agcount; - agno = be32_to_cpu(agi->agi_seqno); - tagno = agno; - pagno = XFS_INO_TO_AGNO(mp, parent); - pagino = XFS_INO_TO_AGINO(mp, parent); - - /* - * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free - * inode. - */ - - if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { - noroom = 1; - okalloc = 0; - } + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur, *tcur; + struct xfs_inobt_rec_incore rec, trec; + xfs_ino_t ino; + int error; + int offset; + int i, j; - /* - * Loop until we find an allocation group that either has free inodes - * or in which we can allocate some inodes. Iterate through the - * allocation groups upward, wrapping at the end. - */ - *alloc_done = B_FALSE; - while (!agi->agi_freecount) { - /* - * Don't do anything if we're not supposed to allocate - * any blocks, just go on to the next ag. - */ - if (okalloc) { - /* - * Try to allocate some new inodes in the allocation - * group. - */ - if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { - xfs_trans_brelse(tp, agbp); - if (error == ENOSPC) { - *inop = NULLFSINO; - return 0; - } else - return error; - } - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *alloc_done = B_TRUE; - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; - } - } - /* - * If it failed, give up on this ag. - */ - xfs_trans_brelse(tp, agbp); - /* - * Go on to the next ag: get its ag header. - */ -nextag: - if (++tagno == agcount) - tagno = 0; - if (tagno == agno) { - *inop = NULLFSINO; - return noroom ? ENOSPC : 0; - } - pag = xfs_perag_get(mp, tagno); - if (pag->pagi_inodeok == 0) { - xfs_perag_put(pag); - goto nextag; - } - error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - xfs_perag_put(pag); - if (error) - goto nextag; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } - /* - * Here with an allocation group that has a free inode. - * Reset agno since we may have chosen a new ag in the - * loop above. - */ - agno = tagno; - *IO_agbp = NULL; pag = xfs_perag_get(mp, agno); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1020,6 +867,158 @@ error0: return error; } +/* + * Allocate an inode on disk. + * + * Mode is used to tell whether the new inode will need space, and whether it + * is a directory. + * + * This function is designed to be called twice if it has to do an allocation + * to make more free inodes. On the first call, *IO_agbp should be set to NULL. + * If an inode is available without having to performn an allocation, an inode + * number is returned. In this case, *IO_agbp would be NULL. If an allocation + * needes to be done, xfs_dialloc would return the current AGI buffer in + * *IO_agbp. The caller should then commit the current transaction, allocate a + * new transaction, and call xfs_dialloc() again, passing in the previous value + * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI + * buffer is locked across the two calls, the second call is guaranteed to have + * a free inode available. + * + * Once we successfully pick an inode its number is returned and the on-disk + * data structures are updated. The inode itself is not read in, since doing so + * would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + struct xfs_trans *tp, + xfs_ino_t parent, + umode_t mode, + int okalloc, + struct xfs_buf **IO_agbp, + boolean_t *alloc_done, + xfs_ino_t *inop) +{ + struct xfs_buf *agbp; + xfs_agnumber_t agno; + struct xfs_agi *agi; + int error; + int ialloced; + int noroom = 0; + struct xfs_mount *mp; + xfs_agnumber_t tagno; + struct xfs_perag *pag; + + if (*IO_agbp == NULL) { + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + /* + * Couldn't find an allocation group satisfying the + * criteria, give up. + */ + if (!agbp) { + *inop = NULLFSINO; + return 0; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + } else { + /* + * Continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + } + mp = tp->t_mountp; + agno = be32_to_cpu(agi->agi_seqno); + tagno = agno; + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + *alloc_done = B_FALSE; + while (!agi->agi_freecount) { + /* + * Don't do anything if we're not supposed to allocate + * any blocks, just go on to the next ag. + */ + if (okalloc) { + /* + * Try to allocate some new inodes in the allocation + * group. + */ + if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { + xfs_trans_brelse(tp, agbp); + if (error == ENOSPC) { + *inop = NULLFSINO; + return 0; + } else + return error; + } + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + *alloc_done = B_TRUE; + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + } + /* + * If it failed, give up on this ag. + */ + xfs_trans_brelse(tp, agbp); + /* + * Go on to the next ag: get its ag header. + */ +nextag: + if (++tagno == mp->m_sb.sb_agcount) + tagno = 0; + if (tagno == agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + pag = xfs_perag_get(mp, tagno); + if (pag->pagi_inodeok == 0) { + xfs_perag_put(pag); + goto nextag; + } + error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + xfs_perag_put(pag); + if (error) + goto nextag; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + } + + *IO_agbp = NULL; + return xfs_dialloc_ag(tp, agbp, parent, inop); +} + /* * Free disk inode. Carefully avoids touching the incore inode, all * manipulations incore are the caller's responsibility. -- cgit v1.2.3 From 08358906ed78f6ab4d3ff8e4fd1b87b9a4aea645 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:47 -0400 Subject: xfs: remove the alloc_done argument to xfs_dialloc We can simplify check the IO_agbp pointer for being non-NULL instead of passing another argument through two layers of function calls. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 3 --- fs/xfs/xfs_ialloc.h | 2 -- fs/xfs/xfs_inode.c | 5 ++--- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_utils.c | 17 +++++++---------- 5 files changed, 10 insertions(+), 19 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a124b9f88aa..2b70952c9d8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -895,7 +895,6 @@ xfs_dialloc( umode_t mode, int okalloc, struct xfs_buf **IO_agbp, - boolean_t *alloc_done, xfs_ino_t *inop) { struct xfs_buf *agbp; @@ -955,7 +954,6 @@ xfs_dialloc( * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ - *alloc_done = B_FALSE; while (!agi->agi_freecount) { /* * Don't do anything if we're not supposed to allocate @@ -982,7 +980,6 @@ xfs_dialloc( * us again where we left off. */ ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *alloc_done = B_TRUE; *IO_agbp = agbp; *inop = NULLFSINO; return 0; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 65ac57c8063..1fd6ea4e9c9 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -75,8 +75,6 @@ xfs_dialloc( umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ - boolean_t *alloc_done, /* an allocation was done to replenish - the free inodes */ xfs_ino_t *inop); /* inode number allocated */ /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d48e406de07..5c10825f2f8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -887,7 +887,6 @@ xfs_ialloc( prid_t prid, int okalloc, xfs_buf_t **ialloc_context, - boolean_t *call_again, xfs_inode_t **ipp) { xfs_ino_t ino; @@ -902,10 +901,10 @@ xfs_ialloc( * the on-disk inode to be allocated. */ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, - ialloc_context, call_again, &ino); + ialloc_context, &ino); if (error) return error; - if (*call_again || ino == NULLFSINO) { + if (*ialloc_context || ino == NULLFSINO) { *ipp = NULL; return 0; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c2e2da3abae..04d2fe421b9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -517,7 +517,7 @@ void xfs_inode_free(struct xfs_inode *ip); */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, boolean_t *, xfs_inode_t **); + struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 4e5b9ad5cb9..0025c78ac03 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -65,7 +65,6 @@ xfs_dir_ialloc( xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; - boolean_t call_again = B_FALSE; int code; uint log_res; uint log_count; @@ -91,7 +90,7 @@ xfs_dir_ialloc( * the inode(s) that we've just allocated. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &call_again, &ip); + &ialloc_context, &ip); /* * Return an error if we were unable to allocate a new inode. @@ -102,19 +101,18 @@ xfs_dir_ialloc( *ipp = NULL; return code; } - if (!call_again && (ip == NULL)) { + if (!ialloc_context && !ip) { *ipp = NULL; return XFS_ERROR(ENOSPC); } /* - * If call_again is set, then we were unable to get an + * If the AGI buffer is non-NULL, then we were unable to get an * inode in one operation. We need to commit the current * transaction and call xfs_ialloc() again. It is guaranteed * to succeed the second time. */ - if (call_again) { - + if (ialloc_context) { /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -195,7 +193,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &call_again, &ip); + okalloc, &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -206,12 +204,11 @@ xfs_dir_ialloc( *ipp = NULL; return code; } - ASSERT ((!call_again) && (ip != NULL)); + ASSERT(!ialloc_context && ip); } else { - if (committed != NULL) { + if (committed != NULL) *committed = 0; - } } *ipp = ip; -- cgit v1.2.3 From 4bb61069d2019dea2a7e4e0f4432101f03a9b820 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:48 -0400 Subject: xfs: add a short cut to xfs_dialloc for the non-NULL agbp case In this case we already have selected an AG and know it has free space beause the buffer lock never got released. Jump directly into xfs_dialloc_ag and short cut the AG selection loop. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 2b70952c9d8..7aa8a02b793 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -634,6 +634,10 @@ xfs_dialloc_ag( pag = xfs_perag_get(mp, agno); + ASSERT(pag->pagi_init); + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pagi_freecount > 0); + restart_pagno: cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); /* @@ -907,32 +911,32 @@ xfs_dialloc( xfs_agnumber_t tagno; struct xfs_perag *pag; - if (*IO_agbp == NULL) { - /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. - */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (*IO_agbp) { /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { - *inop = NULLFSINO; - return 0; - } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - } else { - /* - * Continue where we left off before. In this case, we + * If the caller passes in a pointer to the AGI buffer, + * continue where we left off before. In this case, we * know that the allocation group has free inodes. */ agbp = *IO_agbp; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); + goto out_alloc; } + + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + + /* + * Couldn't find an allocation group satisfying the + * criteria, give up. + */ + if (!agbp) { + *inop = NULLFSINO; + return 0; + } + agi = XFS_BUF_TO_AGI(agbp); + mp = tp->t_mountp; agno = be32_to_cpu(agi->agi_seqno); tagno = agno; @@ -1012,6 +1016,7 @@ nextag: ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); } +out_alloc: *IO_agbp = NULL; return xfs_dialloc_ag(tp, agbp, parent, inop); } -- cgit v1.2.3 From 55d6af64cb8bf8c7e9a84b254d2c3479be8c067c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:49 -0400 Subject: xfs: refactor xfs_ialloc_ag_select Loop over the in-core perag structures and prefer using pagi_freecount over going out to the AGI buffer where possible. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 95 +++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 7aa8a02b793..ecb9f22a7f3 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -442,14 +442,13 @@ xfs_ialloc_next_ag( * Select an allocation group to look for a free inode in, based on the parent * inode and then mode. Return the allocation group buffer. */ -STATIC xfs_buf_t * /* allocation group buffer */ +STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { - xfs_buf_t *agbp; /* allocation group header buffer */ xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ int flags; /* alloc buffer locking flags */ @@ -459,6 +458,7 @@ xfs_ialloc_ag_select( int needspace; /* file mode implies space allocated */ xfs_perag_t *pag; /* per allocation group data */ xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; /* * Files of these types need at least one block if length > 0 @@ -474,7 +474,9 @@ xfs_ialloc_ag_select( if (pagno >= agcount) pagno = 0; } + ASSERT(pagno < agcount); + /* * Loop through allocation groups, looking for one with a little * free space in it. Note we don't look for free inodes, exactly. @@ -486,51 +488,45 @@ xfs_ialloc_ag_select( flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + if (!pag->pagi_init) { - if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) goto nextag; - } - } else - agbp = NULL; + } - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto unlock_nextag; + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; } - /* - * Is there enough free space for the file plus a block - * of inodes (if we need to allocate some)? - */ - ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); - if (ineed && !pag->pagf_init) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) goto nextag; - } - (void)xfs_alloc_pagf_init(mp, tp, agno, flags); } - if (!ineed || pag->pagf_init) { - if (ineed && !(longest = pag->pagf_longest)) - longest = pag->pagf_flcount > 0; - if (!ineed || - (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed && - okalloc)) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; - goto nextag; - } - xfs_perag_put(pag); - return agbp; - } + + /* + * Is there enough free space for the file plus a block of + * inodes? (if we need to allocate some)? + */ + ineed = XFS_IALLOC_BLOCKS(mp); + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; } -unlock_nextag: - if (agbp) - xfs_trans_brelse(tp, agbp); nextag: xfs_perag_put(pag); /* @@ -538,13 +534,13 @@ nextag: * down. */ if (XFS_FORCED_SHUTDOWN(mp)) - return NULL; + return NULLAGNUMBER; agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { if (flags == 0) - return NULL; + return NULLAGNUMBER; flags = 0; } } @@ -901,13 +897,13 @@ xfs_dialloc( struct xfs_buf **IO_agbp, xfs_ino_t *inop) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno; struct xfs_agi *agi; int error; int ialloced; int noroom = 0; - struct xfs_mount *mp; xfs_agnumber_t tagno; struct xfs_perag *pag; @@ -925,20 +921,17 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - - /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { + agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; } + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + return XFS_ERROR(error); agi = XFS_BUF_TO_AGI(agbp); - mp = tp->t_mountp; - agno = be32_to_cpu(agi->agi_seqno); tagno = agno; /* -- cgit v1.2.3 From be60fe54b216a62403b816d3930a66ad7c51cbc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 10:54:50 -0400 Subject: xfs: do not read the AGI buffer in xfs_dialloc until nessecary Refactor the AG selection loop in xfs_dialloc to operate on the in-memory perag data as much as possible. We only read the AGI buffer once we have selected an AG to allocate inodes now instead of for every AG considered. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 127 ++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 58 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ecb9f22a7f3..21e37b55f7e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -900,11 +900,10 @@ xfs_dialloc( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno; - struct xfs_agi *agi; int error; int ialloced; int noroom = 0; - xfs_agnumber_t tagno; + xfs_agnumber_t start_agno; struct xfs_perag *pag; if (*IO_agbp) { @@ -921,25 +920,17 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - if (agno == NULLAGNUMBER) { + start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; } - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) - return XFS_ERROR(error); - agi = XFS_BUF_TO_AGI(agbp); - - tagno = agno; - /* * If we have already hit the ceiling of inode blocks then clear * okalloc so we scan all available agi structures for a free * inode. */ - if (mp->m_maxicount && mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { noroom = 1; @@ -951,67 +942,87 @@ xfs_dialloc( * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ - while (!agi->agi_freecount) { - /* - * Don't do anything if we're not supposed to allocate - * any blocks, just go on to the next ag. - */ - if (okalloc) { - /* - * Try to allocate some new inodes in the allocation - * group. - */ - if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { - xfs_trans_brelse(tp, agbp); - if (error == ENOSPC) { - *inop = NULLFSINO; - return 0; - } else - return error; - } - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; - } + agno = start_agno; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto out_error; } + /* - * If it failed, give up on this ag. + * Do a first racy fast path check if this AG is usable. */ - xfs_trans_brelse(tp, agbp); + if (!pag->pagi_freecount && !okalloc) + goto nextag; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + goto out_error; + /* - * Go on to the next ag: get its ag header. + * Once the AGI has been read in we have to recheck + * pagi_freecount with the AGI buffer lock held. */ -nextag: - if (++tagno == mp->m_sb.sb_agcount) - tagno = 0; - if (tagno == agno) { + if (pag->pagi_freecount) { + xfs_perag_put(pag); + goto out_alloc; + } + + if (!okalloc) { + xfs_trans_brelse(tp, agbp); + goto nextag; + } + + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); + if (error) { + xfs_trans_brelse(tp, agbp); + + if (error != ENOSPC) + goto out_error; + + xfs_perag_put(pag); *inop = NULLFSINO; - return noroom ? ENOSPC : 0; + return 0; } - pag = xfs_perag_get(mp, tagno); - if (pag->pagi_inodeok == 0) { + + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(pag->pagi_freecount > 0); xfs_perag_put(pag); - goto nextag; + + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; } - error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + +nextag: xfs_perag_put(pag); - if (error) - goto nextag; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + if (++agno == mp->m_sb.sb_agcount) + agno = 0; + if (agno == start_agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } } out_alloc: *IO_agbp = NULL; return xfs_dialloc_ag(tp, agbp, parent, inop); +out_error: + xfs_perag_put(pag); + return XFS_ERROR(error); } /* -- cgit v1.2.3 From b373e98daa70d7ddb10f53f81e711c4d17651795 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 11:13:29 -0400 Subject: xfs: clean up xfs_inactive The code to reserve log space and join the inode to the transaction is common for all cases, so don't duplicate it. Also remove the trivial xfs_inactive_symlink_local helper which can simply be opencode now. Signed-off-by: Christoph Hellwig Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 171 +++++++++++++------------------------------------- 1 file changed, 43 insertions(+), 128 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c22f4e0ecac..f9a515776a9 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -282,23 +282,15 @@ xfs_inactive_symlink_rmt( * free them all in one bunmapi call. */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - return error; - } + /* * Lock the inode, fix the size, and join it to the transaction. * Hold it so in the normal path, we still have it locked for * the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ - xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -385,67 +377,15 @@ xfs_inactive_symlink_rmt( ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error0; } - /* - * Return with the inode locked but not joined to the transaction. - */ + + xfs_trans_ijoin(tp, ip, 0); *tpp = tp; return 0; error1: xfs_bmap_cancel(&free_list); error0: - /* - * Have to come here with the inode locked and either - * (held and in the transaction) or (not in the transaction). - * If the inode isn't held then cancel would iput it, but - * that's wrong since this is inactive and the vnode ref - * count is 0 already. - * Cancel won't do anything to the inode if held, but it still - * needs to be locked until the cancel is done, if it was - * joined to the transaction. - */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - *tpp = NULL; return error; - -} - -STATIC int -xfs_inactive_symlink_local( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - int error; - - ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink which fit into - * the inode. Just free the memory used - * to hold the old symlink. - */ - error = xfs_trans_reserve(*tpp, 0, - XFS_ITRUNCATE_LOG_RES(ip->i_mount), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - - if (error) { - xfs_trans_cancel(*tpp, 0); - *tpp = NULL; - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, - -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } - return 0; } STATIC int @@ -604,7 +544,7 @@ xfs_inactive( xfs_trans_t *tp; xfs_mount_t *mp; int error; - int truncate; + int truncate = 0; /* * If the inode is already free, then there can be nothing @@ -616,17 +556,6 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - /* - * Only do a truncate if it's a regular file with - * some actual space in it. It's OK to look at the - * inode's fields without the lock because we're the - * only one with a reference to the inode. - */ - truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || - (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - S_ISREG(ip->i_d.di_mode)); - mp = ip->i_mount; error = 0; @@ -650,72 +579,54 @@ xfs_inactive( goto out; } - ASSERT(ip->i_d.di_nlink == 0); + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; error = xfs_qm_dqattach(ip, 0); if (error) return VN_INACTIVE_CACHE; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (truncate) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - /* Don't call itruncate_cleanup */ - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return VN_INACTIVE_CACHE; - } + error = xfs_trans_reserve(tp, 0, + (truncate || S_ISLNK(ip->i_d.di_mode)) ? + XFS_ITRUNCATE_LOG_RES(mp) : + XFS_IFREE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return VN_INACTIVE_CACHE; + } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + if (S_ISLNK(ip->i_d.di_mode)) { + /* + * Zero length symlinks _can_ exist. + */ + if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) { + error = xfs_inactive_symlink_rmt(ip, &tp); + if (error) + goto out_cancel; + } else if (ip->i_df.if_bytes > 0) { + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + } + } else if (truncate) { ip->i_d.di_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); - if (error) { - xfs_trans_cancel(tp, - XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - return VN_INACTIVE_CACHE; - } + if (error) + goto out_cancel; ASSERT(ip->i_d.di_nextents == 0); - } else if (S_ISLNK(ip->i_d.di_mode)) { - - /* - * If we get an error while cleaning up a - * symlink we bail out. - */ - error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? - xfs_inactive_symlink_rmt(ip, &tp) : - xfs_inactive_symlink_local(ip, &tp); - - if (error) { - ASSERT(tp == NULL); - return VN_INACTIVE_CACHE; - } - - xfs_trans_ijoin(tp, ip, 0); - } else { - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - return VN_INACTIVE_CACHE; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); } /* @@ -781,7 +692,11 @@ xfs_inactive( xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - out: +out: + return VN_INACTIVE_CACHE; +out_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } -- cgit v1.2.3 From fe67be036ff2f713b1c5f24dd4cdffae75bcb97a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 11:13:30 -0400 Subject: xfs: remove xfs_inactive_attrs Remove this helper as the code flow is a lot more obvious when it gets merged into its only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 97 +++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 61 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f9a515776a9..9a2ae8c0ecc 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -388,54 +388,6 @@ xfs_inactive_symlink_rmt( return error; } -STATIC int -xfs_inactive_attrs( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_trans_t *tp; - int error; - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_forkoff != 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_unlock; - - error = xfs_attr_inactive(ip); - if (error) - goto error_unlock; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) - goto error_cancel; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - ASSERT(ip->i_d.di_anextents == 0); - - *tpp = tp; - return 0; - -error_cancel: - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); -error_unlock: - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; -} - int xfs_release( xfs_inode_t *ip) @@ -630,24 +582,40 @@ xfs_inactive( } /* - * If there are attributes associated with the file - * then blow them away now. The code calls a routine - * that recursively deconstructs the attribute fork. - * We need to just commit the current transaction + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. We need to just commit the current transaction * because we can't use it for xfs_attr_inactive(). */ if (ip->i_d.di_anextents > 0) { - error = xfs_inactive_attrs(ip, &tp); - /* - * If we got an error, the transaction is already - * cancelled, and the inode is unlocked. Just get out. - */ - if (error) - return VN_INACTIVE_CACHE; - } else if (ip->i_afp) { - xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ASSERT(ip->i_d.di_forkoff != 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error_unlock; + + error = xfs_attr_inactive(ip); + if (error) + goto error_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) + goto error_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); } + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + /* * Free the inode. */ @@ -698,6 +666,13 @@ out_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; + +error_cancel: + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); +error_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return VN_INACTIVE_CACHE; } /* -- cgit v1.2.3 From 0b56185b0d64ef89dad1c85bb7403fa762cbe50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 11:13:31 -0400 Subject: xfs: do not take the iolock in xfs_inactive An inode that enters xfs_inactive has been removed from all global lists but the inode hash, and can't be recycled in xfs_iget before it has been marked reclaimable. Thus taking the iolock in here is not nessecary at all, and given the amount of lockdep false positives it has triggered already I'd rather remove the locking. The only change outside of xfs_inactive is relaxing an assert in xfs_itruncate_extents. Signed-off-by: Christoph Hellwig Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 4 +++- fs/xfs/xfs_vnodeops.c | 29 ++++++++++++----------------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5c10825f2f8..2778258fcfa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1123,7 +1123,9 @@ xfs_itruncate_extents( int error = 0; int done = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 9a2ae8c0ecc..79270430daf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -554,7 +554,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); if (S_ISLNK(ip->i_d.di_mode)) { @@ -591,21 +591,24 @@ xfs_inactive( ASSERT(ip->i_d.di_forkoff != 0); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto error_unlock; + goto out_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_attr_inactive(ip); if (error) - goto error_unlock; + goto out; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_INACTIVE_LOG_COUNT); - if (error) - goto error_cancel; + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -658,21 +661,13 @@ xfs_inactive( * Release the dquots held by inode, if any. */ xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); out: return VN_INACTIVE_CACHE; out_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - return VN_INACTIVE_CACHE; - -error_cancel: - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); -error_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return VN_INACTIVE_CACHE; + goto out_unlock; } /* -- cgit v1.2.3 From 5a15322da1a51ad8f3af1962de355885b6c606f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 11:13:32 -0400 Subject: xfs: avoid the iolock in xfs_free_eofblocks for evicted inodes Same rational as the last patch - these inodes are not reachable, so don't bother with locking. Signed-off-by: Christoph Hellwig Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 79270430daf..2a5c637344b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -145,11 +145,6 @@ xfs_readlink( return error; } -/* - * Flags for xfs_free_eofblocks - */ -#define XFS_FREE_EOF_TRYLOCK (1<<0) - /* * This is called by xfs_inactive to free any blocks beyond eof * when the link count isn't zero and by xfs_dm_punch_hole() when @@ -159,7 +154,7 @@ STATIC int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, - int flags) + bool need_iolock) { xfs_trans_t *tp; int error; @@ -201,13 +196,11 @@ xfs_free_eofblocks( */ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (flags & XFS_FREE_EOF_TRYLOCK) { + if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); return 0; } - } else { - xfs_ilock(ip, XFS_IOLOCK_EXCL); } error = xfs_trans_reserve(tp, 0, @@ -217,7 +210,8 @@ xfs_free_eofblocks( if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -244,7 +238,10 @@ xfs_free_eofblocks( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); } return error; } @@ -466,8 +463,7 @@ xfs_release( if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) return 0; - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); + error = xfs_free_eofblocks(mp, ip, true); if (error) return error; @@ -524,7 +520,7 @@ xfs_inactive( (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || ip->i_delayed_blks != 0))) { - error = xfs_free_eofblocks(mp, ip, 0); + error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; } -- cgit v1.2.3 From 4f59af758f9092bc7b266ca919ce6067170e5172 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Jul 2012 11:13:33 -0400 Subject: xfs: remove iolock lock classes Content-Disposition: inline; filename=xfs-remove-iolock-classes Now that we never take the iolock during inode reclaim we don't need to play games with lock classes. Signed-off-by: Christoph Hellwig Reviewed-by: Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_iget.c | 15 --------------- fs/xfs/xfs_inode.h | 2 -- fs/xfs/xfs_super.c | 18 ++---------------- 3 files changed, 2 insertions(+), 33 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 1bb4365e8c2..784a803383e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -40,17 +40,6 @@ #include "xfs_trace.h" -/* - * Define xfs inode iolock lockdep classes. We need to ensure that all active - * inodes are considered the same for lockdep purposes, including inodes that - * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to - * guarantee the locks are considered the same when there are multiple lock - * initialisation siteÑ•. Also, define a reclaimable inode class so it is - * obvious in lockdep reports which class the report is against. - */ -static struct lock_class_key xfs_iolock_active; -struct lock_class_key xfs_iolock_reclaimable; - /* * Allocate and initialise an xfs_inode. */ @@ -80,8 +69,6 @@ xfs_inode_alloc( ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -250,8 +237,6 @@ xfs_iget_cache_hit( ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_active, "xfs_iolock_active"); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 04d2fe421b9..94b32f906e7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -487,8 +487,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) -extern struct lock_class_key xfs_iolock_reclaimable; - /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index cb2deb13b06..bdaf4cb9f4a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -874,6 +874,8 @@ xfs_fs_evict_inode( { xfs_inode_t *ip = XFS_I(inode); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); @@ -882,22 +884,6 @@ xfs_fs_evict_inode( XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); - /* - * The iolock is used by the file system to coordinate reads, - * writes, and block truncates. Up to this point the lock - * protected concurrent accesses by users of the inode. But - * from here forward we're doing some final processing of the - * inode because we're done with it, and although we reuse the - * iolock for protection it is really a distinct lock class - * (in the lockdep sense) from before. To keep lockdep happy - * (and basically indicate what we are doing), we explicitly - * re-init the iolock here. - */ - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - lockdep_set_class_and_name(&ip->i_iolock.mr_lock, - &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); - xfs_inactive(ip); } -- cgit v1.2.3 From 8375f922aaa6e7a880022529202fb486315568c3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 28 Jun 2012 06:52:56 -0400 Subject: xfs: re-enable xfsaild idle mode and fix associated races xfsaild idle mode logic currently leads to a couple hangs: 1.) If xfsaild is rescheduled in during an incremental scan (i.e., tout != 0) and the target has been updated since the previous run, we can hit the new target and go into idle mode with a still populated ail. 2.) A wake up is only issued when the target is pushed forward. The wake up can race with xfsaild if it is currently in the process of entering idle mode, causing future wake up events to be lost. These hangs have been reproduced and verified as fixed by running xfstests 273 in a loop on a slightly modified upstream kernel. The kernel is modified to re-enable idle mode as previously implemented (when count == 0) and with a revert of commit 670ce93f, which includes performance improvements that make this harder to reproduce. The solution, the algorithm for which has been outlined by Dave Chinner, is to modify xfsaild to enter idle mode only when the ail is empty and the push target has not been moved forward since the last push. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_ail.c | 35 ++++++++++++++++++++++++++++++++--- fs/xfs/xfs_trans_priv.h | 1 + 2 files changed, 33 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9c514483e59..6011ee66133 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -383,6 +383,12 @@ xfsaild_push( } spin_lock(&ailp->xa_lock); + + /* barrier matches the xa_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->xa_target; + ailp->xa_target_prev = target; + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); if (!lip) { /* @@ -397,7 +403,6 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail); lsn = lip->li_lsn; - target = ailp->xa_target; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; @@ -527,8 +532,32 @@ xfsaild( __set_current_state(TASK_KILLABLE); else __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + spin_lock(&ailp->xa_lock); + + /* + * Idle if the AIL is empty and we are not racing with a target + * update. We check the AIL after we set the task to a sleep + * state to guarantee that we either catch an xa_target update + * or that a wake_up resets the state to TASK_RUNNING. + * Otherwise, we run the risk of sleeping indefinitely. + * + * The barrier matches the xa_target update in xfs_ail_push(). + */ + smp_rmb(); + if (!xfs_ail_min(ailp) && + ailp->xa_target == ailp->xa_target_prev) { + spin_unlock(&ailp->xa_lock); + schedule(); + tout = 0; + continue; + } + spin_unlock(&ailp->xa_lock); + + if (tout) + schedule_timeout(msecs_to_jiffies(tout)); + + __set_current_state(TASK_RUNNING); try_to_freeze(); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index fb62377d1cb..53b7c9b0f8f 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -67,6 +67,7 @@ struct xfs_ail { struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; struct list_head xa_cursors; spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; -- cgit v1.2.3 From 9a57fa8ee7c29e11c2a29ce058573ba99157eda7 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 24 Jul 2012 10:59:19 -0500 Subject: xfs: wait for the write the superblock on unmount v2: Add the xfs_buf_lock to xfs_quiesce_attr(). Add explaination why xfs_buf_lock() is used to wait for write. xfs_wait_buftarg() does not wait for the completion of the write of the uncached superblock. This write can race with the shutdown of the log and causes a panic if the write does not win the race. During the log write, xfsaild_push() will lock the buffer and set the XBF_ASYNC flag. Because the XBF_FLAG is set, complete() is not performed on the buffer's iowait entry, we cannot call xfs_buf_iowait() to wait for the write to complete. The buffer's lock is held until the write is complete, so we can block on a xfs_buf_lock() request to be notified that the write is complete. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 9 +++++++++ fs/xfs/xfs_sync.c | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9536fd19019..711ca51ca3d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1529,6 +1529,15 @@ xfs_unmountfs( xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index e61fc151907..97304f10e78 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -359,6 +359,15 @@ xfs_quiesce_attr( * added an item to the AIL, thus flush it again. */ xfs_ail_push_all_sync(mp->m_ail); + + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); } static void -- cgit v1.2.3 From d9457dc056249913a7abe8b71dc09e427e590e35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 12 Jun 2012 16:20:39 +0200 Subject: xfs: Convert to new freezing code Generic code now blocks all writers from standard write paths. So we add blocking of all writers coming from ioctl (we get a protection of ioctl against racing remount read-only as a bonus) and convert xfs_file_aio_write() to a non-racy freeze protection. We also keep freeze protection on transaction start to block internal filesystem writes such as removal of preallocated blocks. CC: Ben Myers CC: Alex Elder CC: xfs@oss.sgi.com Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/xfs/xfs_aops.c | 18 +++++++++++++++++ fs/xfs/xfs_file.c | 10 +++++++--- fs/xfs/xfs_ioctl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_ioctl32.c | 12 ++++++++++++ fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 3 --- fs/xfs/xfs_sync.c | 2 +- fs/xfs/xfs_trans.c | 17 +++++++++++++--- fs/xfs/xfs_trans.h | 2 ++ 10 files changed, 109 insertions(+), 16 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8dad722c004..daa42383ebd 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -123,6 +123,12 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; + /* + * We will pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -199,6 +205,15 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (ioend->io_append_trans) { + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -1410,6 +1425,9 @@ out_trans_cancel: if (ioend->io_append_trans) { current_set_flags_nested(&ioend->io_append_trans->t_pflags, PF_FSTRANS); + rwsem_acquire_read( + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_trans_cancel(ioend->io_append_trans, 0); } out_destroy_ioend: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9f7ec15a652..f0081f20e5c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -781,10 +781,12 @@ xfs_file_aio_write( if (ocount == 0) return 0; - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ret = -EIO; + goto out; + } if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -803,6 +805,8 @@ xfs_file_aio_write( ret = err; } +out: + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f1535d25a9..0e0232c3b6d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -364,9 +364,15 @@ xfs_fssetdm_by_handle( if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(parfilp); + if (error) + return error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); return PTR_ERR(dentry); + } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); @@ -382,6 +388,7 @@ xfs_fssetdm_by_handle( fsd.fsd_dmstate); out: + mnt_drop_write_file(parfilp); dput(dentry); return error; } @@ -634,7 +641,11 @@ xfs_ioc_space( if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + mnt_drop_write_file(filp); return -error; } @@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr( { struct fsxattr fa; unsigned int mask; + int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) mask |= FSX_NONBLOCK; - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1196,6 +1213,7 @@ xfs_ioc_setxflags( struct fsxattr fa; unsigned int flags; unsigned int mask; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1210,7 +1228,12 @@ xfs_ioc_setxflags( mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1385,8 +1408,13 @@ xfs_file_ioctl( if (copy_from_user(&dmi, arg, sizeof(dmi))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); + mnt_drop_write_file(filp); return -error; } @@ -1434,7 +1462,11 @@ xfs_file_ioctl( if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } @@ -1463,9 +1495,14 @@ xfs_file_ioctl( if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + /* input parameter is passed in resblks field of structure */ in = inout.resblks; error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); if (error) return -error; @@ -1496,7 +1533,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1506,7 +1547,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1516,7 +1561,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf..1244274a567 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_data_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_rt_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl( offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index aadfce6681e..b3b9b26091a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -680,9 +680,9 @@ xfs_iomap_write_unwritten( * the same inode that we complete here and might deadlock * on the iolock. */ - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE; + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 536021fb3d4..b09a4a7eb64 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1544,7 +1544,7 @@ xfs_unmountfs( int xfs_fs_writable(xfs_mount_t *mp) { - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 90c1fc9eaea..c6bca0d92cb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -314,9 +314,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - /* * Flags for xfs_mountfs */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 1e9ee064dbb..0b9feacdcd1 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -394,7 +394,7 @@ xfs_sync_worker( if (!(mp->m_super->s_flags & MS_ACTIVE) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdf324508c5..06ed520a767 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -576,8 +576,12 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type, KM_SLEEP); + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; } xfs_trans_t * @@ -588,6 +592,7 @@ _xfs_trans_alloc( { xfs_trans_t *tp; + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); @@ -611,6 +616,8 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); } @@ -643,7 +650,11 @@ xfs_trans_dup( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7c37b533aa8..19c17428287 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -179,6 +179,8 @@ struct xfs_log_item_desc { #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ /* * Values for call flags parameter. -- cgit v1.2.3 From a76cccbeef9e91b5f799e8853acac1ed1fc833cb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 31 Jul 2012 14:55:51 +1000 Subject: xfs: fix uninitialised variable in xfs_rtbuf_get() Results in this assert failure in generic/090: XFS: Assertion failed: *nmap >= 1, file: fs/xfs/xfs_bmap.c, line: 4363 ..... Call Trace: [] xfs_bmapi_read+0x6b/0x370 [] xfs_rtbuf_get+0x42/0x130 [] xfs_rtget_summary+0x89/0x120 [] xfs_rtallocate_extent_size+0xce/0x340 [] xfs_rtallocate_extent+0x240/0x290 [] xfs_bmap_rtalloc+0x1ba/0x340 [] xfs_bmap_alloc+0x35/0x40 [] xfs_bmapi_allocate+0xf1/0x350 [] xfs_bmapi_write+0x66e/0xa60 [] xfs_iomap_write_direct+0x22a/0x3f0 [] __xfs_get_blocks+0x38b/0x5d0 [] xfs_get_blocks_direct+0x14/0x20 [] do_blockdev_direct_IO+0xf71/0x1eb0 [] __blockdev_direct_IO+0x55/0x60 [] xfs_vm_direct_IO+0x11a/0x1e0 [] generic_file_direct_write+0xd7/0x1b0 [] xfs_file_dio_aio_write+0x13c/0x320 [] xfs_file_aio_write+0x1c2/0x1d0 [] do_sync_write+0xa7/0xe0 [] vfs_write+0xa8/0x160 [] sys_pwrite64+0x92/0xb0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 92d4331cd4f..ca28a4ba4b5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -857,7 +857,7 @@ xfs_rtbuf_get( xfs_buf_t *bp; /* block buffer, result */ xfs_inode_t *ip; /* bitmap or summary inode */ xfs_bmbt_irec_t map; - int nmap; + int nmap = 1; int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; -- cgit v1.2.3 From 1ed845df60f3f02d4b7cd9fcad79ccb69c289f5c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 Aug 2012 09:56:49 -0500 Subject: xfs: kill struct declarations in xfs_mount.h I noticed that "struct xfs_mount_args" was still declared in "fs/xfs/xfs_mount.h". That struct doesn't even exist any more (and is obviously not referenced elsewhere in that header file). While in there, delete four other unneeded struct declarations in that file. Doing so highlights that "fs/xfs/xfs_trace.h" was relying indirectly on "xfs_mount.h" to be #included in order to declare "struct xfs_bmbt_irec", so add that declaration to resolve that issue. Signed-off-by: Alex Elder Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.h | 5 ----- fs/xfs/xfs_trace.h | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 05a05a7b611..deee09e534d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -54,12 +54,7 @@ typedef struct xfs_trans_reservations { #include "xfs_sync.h" struct xlog; -struct xfs_mount_args; struct xfs_inode; -struct xfs_bmbt_irec; -struct xfs_bmap_free; -struct xfs_extdelta; -struct xfs_swapext; struct xfs_mru_cache; struct xfs_nameops; struct xfs_ail; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e5795dd6013..7d36ccf57f9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,6 +37,7 @@ struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; struct xfs_inode_log_format; +struct xfs_bmbt_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), -- cgit v1.2.3 From c4982110ae93d7575503feb81d15e93c0c5f393c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Aug 2012 02:02:02 -0400 Subject: xfs: unlock the AGI buffer when looping in xfs_dialloc Also update some commens in the area to make the code easier to read. Signed-off-by: Christoph Hellwig Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 21e37b55f7e..5aceb3f8ecd 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -962,23 +962,22 @@ xfs_dialloc( if (!pag->pagi_freecount && !okalloc) goto nextag; + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); if (error) goto out_error; - /* - * Once the AGI has been read in we have to recheck - * pagi_freecount with the AGI buffer lock held. - */ if (pag->pagi_freecount) { xfs_perag_put(pag); goto out_alloc; } - if (!okalloc) { - xfs_trans_brelse(tp, agbp); - goto nextag; - } + if (!okalloc) + goto nextag_relse_buffer; + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); if (error) { @@ -1007,6 +1006,8 @@ xfs_dialloc( return 0; } +nextag_relse_buffer: + xfs_trans_brelse(tp, agbp); nextag: xfs_perag_put(pag); if (++agno == mp->m_sb.sb_agcount) -- cgit v1.2.3 From 643bfc061c47e9c7661324a09fb0a0bc6601e5d6 Mon Sep 17 00:00:00 2001 From: Tomas Racek Date: Tue, 14 Aug 2012 10:35:04 +0200 Subject: xfs: check for possible overflow in xfs_ioc_trim If range.start or range.minlen is bigger than filesystem size, return invalid value error. This fixes possible overflow in BTOBB macro when passed value was nearly ULLONG_MAX. Signed-off-by: Tomas Racek Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_discard.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f9c3fe304a1..69cf4fcde03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,12 +179,14 @@ xfs_ioc_trim( * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + return -XFS_ERROR(EINVAL); + start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) - return -XFS_ERROR(EINVAL); if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; -- cgit v1.2.3 From 43829731dd372d04d6706c51052b9dabab9ca356 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Aug 2012 14:51:24 -0700 Subject: workqueue: deprecate flush[_delayed]_work_sync() flush[_delayed]_work_sync() are now spurious. Mark them deprecated and convert all users to flush[_delayed]_work(). If you're cc'd and wondering what's going on: Now all workqueues are non-reentrant and the regular flushes guarantee that the work item is not pending or running on any CPU on return, so there's no reason to use the sync flushes at all and they're going away. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Cc: Russell King Cc: Paul Mundt Cc: Ian Campbell Cc: Jens Axboe Cc: Mattia Dongili Cc: Kent Yoder Cc: David Airlie Cc: Jiri Kosina Cc: Karsten Keil Cc: Bryan Wu Cc: Benjamin Herrenschmidt Cc: Alasdair Kergon Cc: Mauro Carvalho Chehab Cc: Florian Tobias Schandinat Cc: David Woodhouse Cc: "David S. Miller" Cc: linux-wireless@vger.kernel.org Cc: Anton Vorontsov Cc: Sangbeom Kim Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: Eric Van Hensbergen Cc: Takashi Iwai Cc: Steven Whitehouse Cc: Petr Vandrovec Cc: Mark Fasheh Cc: Christoph Hellwig Cc: Avi Kivity --- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_sync.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bdaf4cb9f4a..e8e6be439bc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -953,7 +953,7 @@ xfs_fs_sync_fs( * We schedule xfssyncd now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work_sync(&mp->m_sync_work); + flush_delayed_work(&mp->m_sync_work); } return 0; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 96548176db8..9500caf15ac 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -475,7 +475,7 @@ xfs_flush_inodes( struct xfs_mount *mp = ip->i_mount; queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work_sync(&mp->m_flush_work); + flush_work(&mp->m_flush_work); } STATIC void -- cgit v1.2.3 From 0b9e3f6d84ce619f697bb622d9165cccaa93d67c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 31 Jul 2012 14:55:51 +1000 Subject: xfs: fix uninitialised variable in xfs_rtbuf_get() Results in this assert failure in generic/090: XFS: Assertion failed: *nmap >= 1, file: fs/xfs/xfs_bmap.c, line: 4363 ..... Call Trace: [] xfs_bmapi_read+0x6b/0x370 [] xfs_rtbuf_get+0x42/0x130 [] xfs_rtget_summary+0x89/0x120 [] xfs_rtallocate_extent_size+0xce/0x340 [] xfs_rtallocate_extent+0x240/0x290 [] xfs_bmap_rtalloc+0x1ba/0x340 [] xfs_bmap_alloc+0x35/0x40 [] xfs_bmapi_allocate+0xf1/0x350 [] xfs_bmapi_write+0x66e/0xa60 [] xfs_iomap_write_direct+0x22a/0x3f0 [] __xfs_get_blocks+0x38b/0x5d0 [] xfs_get_blocks_direct+0x14/0x20 [] do_blockdev_direct_IO+0xf71/0x1eb0 [] __blockdev_direct_IO+0x55/0x60 [] xfs_vm_direct_IO+0x11a/0x1e0 [] generic_file_direct_write+0xd7/0x1b0 [] xfs_file_dio_aio_write+0x13c/0x320 [] xfs_file_aio_write+0x1c2/0x1d0 [] do_sync_write+0xa7/0xe0 [] vfs_write+0xa8/0x160 [] sys_pwrite64+0x92/0xb0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 92d4331cd4f..ca28a4ba4b5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -857,7 +857,7 @@ xfs_rtbuf_get( xfs_buf_t *bp; /* block buffer, result */ xfs_inode_t *ip; /* bitmap or summary inode */ xfs_bmbt_irec_t map; - int nmap; + int nmap = 1; int error; /* error value */ ip = issum ? mp->m_rsumip : mp->m_rbmip; -- cgit v1.2.3 From 761290309939743ddf97e2bd94c6da18c6436b79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Aug 2012 02:02:02 -0400 Subject: xfs: unlock the AGI buffer when looping in xfs_dialloc Also update some commens in the area to make the code easier to read. Signed-off-by: Christoph Hellwig Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 21e37b55f7e..5aceb3f8ecd 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -962,23 +962,22 @@ xfs_dialloc( if (!pag->pagi_freecount && !okalloc) goto nextag; + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); if (error) goto out_error; - /* - * Once the AGI has been read in we have to recheck - * pagi_freecount with the AGI buffer lock held. - */ if (pag->pagi_freecount) { xfs_perag_put(pag); goto out_alloc; } - if (!okalloc) { - xfs_trans_brelse(tp, agbp); - goto nextag; - } + if (!okalloc) + goto nextag_relse_buffer; + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); if (error) { @@ -1007,6 +1006,8 @@ xfs_dialloc( return 0; } +nextag_relse_buffer: + xfs_trans_brelse(tp, agbp); nextag: xfs_perag_put(pag); if (++agno == mp->m_sb.sb_agcount) -- cgit v1.2.3 From a672e1be30d5bc848cd0067c55ed29b2015b7c17 Mon Sep 17 00:00:00 2001 From: Tomas Racek Date: Tue, 14 Aug 2012 10:35:04 +0200 Subject: xfs: check for possible overflow in xfs_ioc_trim If range.start or range.minlen is bigger than filesystem size, return invalid value error. This fixes possible overflow in BTOBB macro when passed value was nearly ULLONG_MAX. Signed-off-by: Tomas Racek Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_discard.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f9c3fe304a1..69cf4fcde03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,12 +179,14 @@ xfs_ioc_trim( * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + return -XFS_ERROR(EINVAL); + start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) - return -XFS_ERROR(EINVAL); if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; -- cgit v1.2.3 From e599b3253c5e49f7a2a579eef2fc2aa066989ef5 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 10 Aug 2012 15:01:51 -0300 Subject: xfs: fix race while discarding buffers [V4] While xfs_buftarg_shrink() is freeing buffers from the dispose list (filled with buffers from lru list), there is a possibility to have xfs_buf_stale() racing with it, and removing buffers from dispose list before xfs_buftarg_shrink() does it. This happens because xfs_buftarg_shrink() handle the dispose list without locking and the test condition in xfs_buf_stale() checks for the buffer being in *any* list: if (!list_empty(&bp->b_lru)) If the buffer happens to be on dispose list, this causes the buffer counter of lru list (btp->bt_lru_nr) to be decremented twice (once in xfs_buftarg_shrink() and another in xfs_buf_stale()) causing a wrong account usage of the lru list. This may cause xfs_buftarg_shrink() to return a wrong value to the memory shrinker shrink_slab(), and such account error may also cause an underflowed value to be returned; since the counter is lower than the current number of items in the lru list, a decrement may happen when the counter is 0, causing an underflow on the counter. The fix uses a new flag field (and a new buffer flag) to serialize buffer handling during the shrink process. The new flag field has been designed to use btp->bt_lru_lock/unlock instead of xfs_buf_lock/unlock mechanism. dchinner, sandeen, aquini and aris also deserve credits for this. Signed-off-by: Carlos Maiolino Reviewed-by: Ben Myers Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 5 ++++- fs/xfs/xfs_buf.h | 41 ++++++++++++++++++++++++----------------- 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d7a9dd735e1..933b7930b86 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,6 +96,7 @@ xfs_buf_lru_add( atomic_inc(&bp->b_hold); list_add_tail(&bp->b_lru, &btp->bt_lru); btp->bt_lru_nr++; + bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); } @@ -154,7 +155,8 @@ xfs_buf_stale( struct xfs_buftarg *btp = bp->b_target; spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { + if (!list_empty(&bp->b_lru) && + !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { list_del_init(&bp->b_lru); btp->bt_lru_nr--; atomic_dec(&bp->b_hold); @@ -1501,6 +1503,7 @@ xfs_buftarg_shrink( */ list_move(&bp->b_lru, &dispose); btp->bt_lru_nr--; + bp->b_lru_flags |= _XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d03b73b9604..7c0b6a0a155 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -38,27 +38,28 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ -#define XBF_FUA (1 << 11)/* force cache write through mode */ -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ -#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t; { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -124,7 +126,12 @@ typedef struct xfs_buf { xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ struct list_head b_lru; /* lru list */ + xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ -- cgit v1.2.3 From 834ab12228fad777a11007a24cb6286b02c9a41c Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 21 Aug 2012 17:11:45 +0800 Subject: xfs: Remove type argument from xfs_seek_data()/xfs_seek_hole() The type is already indicated by the function naming explicitly, so this argument can be omitted from those calls. Signed-off-by: Jie Liu Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 56afcdb2377..92ba18f841f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -962,8 +962,7 @@ xfs_vm_page_mkwrite( STATIC loff_t xfs_seek_data( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); @@ -1029,8 +1028,7 @@ out_unlock: STATIC loff_t xfs_seek_hole( struct file *file, - loff_t start, - u32 type) + loff_t start) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); @@ -1092,9 +1090,9 @@ xfs_file_llseek( case SEEK_SET: return generic_file_llseek(file, offset, origin); case SEEK_DATA: - return xfs_seek_data(file, offset, origin); + return xfs_seek_data(file, offset); case SEEK_HOLE: - return xfs_seek_hole(file, offset, origin); + return xfs_seek_hole(file, offset); default: return -EINVAL; } -- cgit v1.2.3 From d126d43f631f996daeee5006714fed914be32368 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 21 Aug 2012 17:11:57 +0800 Subject: xfs: Introduce a helper routine to probe data or hole offset from page cache Introduce helpers to probe data or hole offset from page cache. Signed-off-by: Jie Liu Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 92ba18f841f..d78a746b6c7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -959,6 +960,224 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +/* + * This type is designed to indicate the type of offset we would like + * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + */ +enum { + HOLE_OFF = 0, + DATA_OFF, +}; + +/* + * Lookup the desired type of offset from the given page. + * + * On success, return true and the offset argument will point to the + * start of the region that was found. Otherwise this function will + * return false and keep the offset argument unchanged. + */ +STATIC bool +xfs_lookup_buffer_offset( + struct page *page, + loff_t *offset, + unsigned int type) +{ + loff_t lastoff = page_offset(page); + bool found = false; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + /* + * Unwritten extents that have data in the page + * cache covering them can be identified by the + * BH_Unwritten state flag. Pages with multiple + * buffers might have a mix of holes, data and + * unwritten extents - any buffer with valid + * data in it should have BH_Uptodate flag set + * on it. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + if (type == DATA_OFF) + found = true; + } else { + if (type == HOLE_OFF) + found = true; + } + + if (found) { + *offset = lastoff; + break; + } + lastoff += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + return found; +} + +/* + * This routine is called to find out and return a data or hole offset + * from the page cache for unwritten extents according to the desired + * type for xfs_seek_data() or xfs_seek_hole(). + * + * The argument offset is used to tell where we start to search from the + * page cache. Map is used to figure out the end points of the range to + * lookup pages. + * + * Return true if the desired type of offset was found, and the argument + * offset is filled with that address. Otherwise, return false and keep + * offset unchanged. + */ +STATIC bool +xfs_find_get_desired_pgoff( + struct inode *inode, + struct xfs_bmbt_irec *map, + unsigned int type, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff = *offset; + loff_t lastoff = startoff; + bool found = false; + + pagevec_init(&pvec, 0); + + index = startoff >> PAGE_CACHE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); + end = endoff >> PAGE_CACHE_SHIFT; + do { + int want; + unsigned nr_pages; + unsigned int i; + + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* + * No page mapped into given range. If we are searching holes + * and if this is the first time we got into the loop, it means + * that the given offset is landed in a hole, return it. + * + * If we have already stepped through some block buffers to find + * holes but they all contains data. In this case, the last + * offset is already updated and pointed to the end of the last + * mapped page, if it does not reach the endpoint to search, + * that means there should be a hole between them. + */ + if (nr_pages == 0) { + /* Data search found nothing */ + if (type == DATA_OFF) + break; + + ASSERT(type == HOLE_OFF); + if (lastoff == startoff || lastoff < endoff) { + found = true; + *offset = lastoff; + } + break; + } + + /* + * At lease we found one page. If this is the first time we + * step into the loop, and if the first page index offset is + * greater than the given search offset, a hole was found. + */ + if (type == HOLE_OFF && lastoff == startoff && + lastoff < page_offset(pvec.pages[0])) { + found = true; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to tmpfs + * file mapping. However, page->index will not change + * because we have a reference on the page. + * + * Searching done if the page index is out of range. + * If the current offset is not reaches the end of + * the specified search range, there should be a hole + * between them. + */ + if (page->index > end) { + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } + goto out; + } + + lock_page(page); + /* + * Page truncated or invalidated(page->mapping == NULL). + * We can freely skip it and proceed to check the next + * page. + */ + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + found = xfs_lookup_buffer_offset(page, &b_offset, type); + if (found) { + /* + * The found offset may be less than the start + * point to search if this is the first time to + * come here. + */ + *offset = max_t(loff_t, startoff, b_offset); + unlock_page(page); + goto out; + } + + /* + * We either searching data but nothing was found, or + * searching hole but found a data buffer. In either + * case, probably the next page contains the desired + * things, update the last offset to it so. + */ + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of returned pages less than our desired, search + * done. In this case, nothing was found for searching data, + * but we found a hole behind the last offset. + */ + if (nr_pages < want) { + if (type == HOLE_OFF) { + *offset = lastoff; + found = true; + } + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + STATIC loff_t xfs_seek_data( struct file *file, -- cgit v1.2.3 From 52f1acc8b56a333fbc7218711c3fa2fb3bf78b92 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 21 Aug 2012 17:12:07 +0800 Subject: xfs: xfs_seek_data() refinement with unwritten extents check up from page cache xfs_seek_data() refinement with unwritten extents check up from page cache. Signed-off-by: Jie Liu Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 72 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 18 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d78a746b6c7..3f9107431df 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1186,8 +1186,6 @@ xfs_seek_data( struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec map[2]; - int nmap = 2; loff_t uninitialized_var(offset); xfs_fsize_t isize; xfs_fileoff_t fsbno; @@ -1203,36 +1201,74 @@ xfs_seek_data( goto out_unlock; } - fsbno = XFS_B_TO_FSBT(mp, start); - /* * Try to read extents from the first block indicated * by fsbno to the end block of the file. */ + fsbno = XFS_B_TO_FSBT(mp, start); end = XFS_B_TO_FSB(mp, isize); + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_unlock; + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; - /* - * Treat unwritten extent as data extent since it might - * contains dirty data in page cache. - */ - if (map[0].br_startblock != HOLESTARTBLOCK) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[0].br_startoff)); - } else { + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a data extent */ + if (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock))) + goto out; + + /* + * Landed in an unwritten extent, try to search data + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + DATA_OFF, &offset)) + goto out; + } + } + + /* + * map[0] is hole or its an unwritten extent but + * without data in page cache. Probably means that + * we are reading after EOF if nothing in map[1]. + */ if (nmap == 1) { error = ENXIO; goto out_unlock; } - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[1].br_startoff)); + ASSERT(i > 1); + + /* + * Nothing was found, proceed to the next round of search + * if reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } } +out: if (offset != file->f_pos) file->f_pos = offset; -- cgit v1.2.3 From b686d1f79acb65c6a34473c15fcfa2ee54aed8e2 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 21 Aug 2012 17:12:18 +0800 Subject: xfs: xfs_seek_hole() refinement with hole searching from page cache for unwritten extents xfs_seek_hole() refinement with hole searching from page cache for unwritten extent. Signed-off-by: Jie Liu Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 11 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3f9107431df..1eaeb8be3aa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1289,9 +1289,9 @@ xfs_seek_hole( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; loff_t uninitialized_var(offset); - loff_t holeoff; xfs_fsize_t isize; xfs_fileoff_t fsbno; + xfs_filblks_t end; uint lock; int error; @@ -1307,21 +1307,77 @@ xfs_seek_hole( } fsbno = XFS_B_TO_FSBT(mp, start); - error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); - if (error) - goto out_unlock; + end = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a hole */ + if (map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* + * Landed in an unwritten extent, try to search hole + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + HOLE_OFF, &offset)) + goto out; + } + } - holeoff = XFS_FSB_TO_B(mp, fsbno); - if (holeoff <= start) - offset = start; - else { /* - * xfs_bmap_first_unused() could return a value bigger than - * isize if there are no more holes past the supplied offset. + * map[0] contains data or its unwritten but contains + * data in page cache, probably means that we are + * reading after EOF. We should fix offset to point + * to the end of the file(i.e., there is an implicit + * hole at the end of any file). */ - offset = min_t(loff_t, holeoff, isize); + if (nmap == 1) { + offset = isize; + break; + } + + ASSERT(i > 1); + + /* + * Both mappings contains data, proceed to the next round of + * search if the current reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + offset = isize; + break; + } } +out: + /* + * At this point, we must have found a hole. However, the returned + * offset may be bigger than the file size as it may be aligned to + * page boundary for unwritten extents, we need to deal with this + * situation in particular. + */ + offset = min_t(loff_t, offset, isize); if (offset != file->f_pos) file->f_pos = offset; -- cgit v1.2.3 From 6fb8a90aa3f2319a25f3396b1e9273300f8903b8 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 10 Aug 2012 15:01:51 -0300 Subject: xfs: fix race while discarding buffers [V4] While xfs_buftarg_shrink() is freeing buffers from the dispose list (filled with buffers from lru list), there is a possibility to have xfs_buf_stale() racing with it, and removing buffers from dispose list before xfs_buftarg_shrink() does it. This happens because xfs_buftarg_shrink() handle the dispose list without locking and the test condition in xfs_buf_stale() checks for the buffer being in *any* list: if (!list_empty(&bp->b_lru)) If the buffer happens to be on dispose list, this causes the buffer counter of lru list (btp->bt_lru_nr) to be decremented twice (once in xfs_buftarg_shrink() and another in xfs_buf_stale()) causing a wrong account usage of the lru list. This may cause xfs_buftarg_shrink() to return a wrong value to the memory shrinker shrink_slab(), and such account error may also cause an underflowed value to be returned; since the counter is lower than the current number of items in the lru list, a decrement may happen when the counter is 0, causing an underflow on the counter. The fix uses a new flag field (and a new buffer flag) to serialize buffer handling during the shrink process. The new flag field has been designed to use btp->bt_lru_lock/unlock instead of xfs_buf_lock/unlock mechanism. dchinner, sandeen, aquini and aris also deserve credits for this. Signed-off-by: Carlos Maiolino Reviewed-by: Ben Myers Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 5 ++++- fs/xfs/xfs_buf.h | 41 ++++++++++++++++++++++++----------------- 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d7a9dd735e1..933b7930b86 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -96,6 +96,7 @@ xfs_buf_lru_add( atomic_inc(&bp->b_hold); list_add_tail(&bp->b_lru, &btp->bt_lru); btp->bt_lru_nr++; + bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); } @@ -154,7 +155,8 @@ xfs_buf_stale( struct xfs_buftarg *btp = bp->b_target; spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { + if (!list_empty(&bp->b_lru) && + !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { list_del_init(&bp->b_lru); btp->bt_lru_nr--; atomic_dec(&bp->b_hold); @@ -1501,6 +1503,7 @@ xfs_buftarg_shrink( */ list_move(&bp->b_lru, &dispose); btp->bt_lru_nr--; + bp->b_lru_flags |= _XBF_LRU_DISPOSE; } spin_unlock(&btp->bt_lru_lock); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d03b73b9604..7c0b6a0a155 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -38,27 +38,28 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ -#define XBF_FUA (1 << 11)/* force cache write through mode */ -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ -#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t; { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } typedef struct xfs_buftarg { dev_t bt_dev; @@ -124,7 +126,12 @@ typedef struct xfs_buf { xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ struct list_head b_lru; /* lru list */ + xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ -- cgit v1.2.3 From 4026c9fde9c67266932afd209e25bfef4474a1be Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Thu, 13 Sep 2012 16:18:47 -0500 Subject: xfs: stop the sync worker before xfs_unmountfs Cancel work of the xfs_sync_worker before teardown of the log in xfs_unmountfs. This prevents occasional crashes on unmount like so: PID: 21602 TASK: ee9df060 CPU: 0 COMMAND: "kworker/0:3" #0 [c5377d28] crash_kexec at c0292c94 #1 [c5377d80] oops_end at c07090c2 #2 [c5377d98] no_context at c06f614e #3 [c5377dbc] __bad_area_nosemaphore at c06f6281 #4 [c5377df4] bad_area_nosemaphore at c06f629b #5 [c5377e00] do_page_fault at c070b0cb #6 [c5377e7c] error_code (via page_fault) at c070892c EAX: f300c6a8 EBX: f300c6a8 ECX: 000000c0 EDX: 000000c0 EBP: c5377ed0 DS: 007b ESI: 00000000 ES: 007b EDI: 00000001 GS: ffffad20 CS: 0060 EIP: c0481ad0 ERR: ffffffff EFLAGS: 00010246 #7 [c5377eb0] atomic64_read_cx8 at c0481ad0 #8 [c5377ebc] xlog_assign_tail_lsn_locked at f7cc7c6e [xfs] #9 [c5377ed4] xfs_trans_ail_delete_bulk at f7ccd520 [xfs] #10 [c5377f0c] xfs_buf_iodone at f7ccb602 [xfs] #11 [c5377f24] xfs_buf_do_callbacks at f7cca524 [xfs] #12 [c5377f30] xfs_buf_iodone_callbacks at f7cca5da [xfs] #13 [c5377f4c] xfs_buf_iodone_work at f7c718d0 [xfs] #14 [c5377f58] process_one_work at c024ee4c #15 [c5377f98] worker_thread at c024f43d #16 [c5377fbc] kthread at c025326b #17 [c5377fe8] kernel_thread_helper at c070e834 PID: 26653 TASK: e79143b0 CPU: 3 COMMAND: "umount" #0 [cde0fda0] __schedule at c0706595 #1 [cde0fe28] schedule at c0706b89 #2 [cde0fe30] schedule_timeout at c0705600 #3 [cde0fe94] __down_common at c0706098 #4 [cde0fec8] __down at c0706122 #5 [cde0fed0] down at c025936f #6 [cde0fee0] xfs_buf_lock at f7c7131d [xfs] #7 [cde0ff00] xfs_freesb at f7cc2236 [xfs] #8 [cde0ff10] xfs_fs_put_super at f7c80f21 [xfs] #9 [cde0ff1c] generic_shutdown_super at c0333d7a #10 [cde0ff38] kill_block_super at c0333e0f #11 [cde0ff48] deactivate_locked_super at c0334218 #12 [cde0ff58] deactivate_super at c033495d #13 [cde0ff68] mntput_no_expire at c034bc13 #14 [cde0ff7c] sys_umount at c034cc69 #15 [cde0ffa0] sys_oldumount at c034ccd4 #16 [cde0ffb0] system_call at c0707e66 commit 11159a05 added this to xfs_log_unmount and needs to be cleaned up at a later date. Signed-off-by: Ben Myers Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely --- fs/xfs/xfs_super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bdaf4cb9f4a..19e2380fb86 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -919,6 +919,7 @@ xfs_fs_put_super( struct xfs_mount *mp = XFS_M(sb); xfs_filestream_unmount(mp); + cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); xfs_syncd_stop(mp); xfs_freesb(mp); -- cgit v1.2.3 From 5f3a4a28ec140a90e6058d1d09f6b1f235d485e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 20:17:44 -0700 Subject: userns: Pass a userns parameter into posix_acl_to_xattr and posix_acl_from_xattr - Pass the user namespace the uid and gid values in the xattr are stored in into posix_acl_from_xattr. - Pass the user namespace kuid and kgid values should be converted into when storing uid and gid values in an xattr in posix_acl_to_xattr. - Modify all callers of posix_acl_from_xattr and posix_acl_to_xattr to pass in &init_user_ns. In the short term this change is not strictly needed but it makes the code clearer. In the longer term this change is necessary to be able to mount filesystems outside of the initial user namespace that natively store posix acls in the linux xattr format. Cc: Theodore Tso Cc: Andrew Morton Cc: Andreas Dilger Cc: Jan Kara Cc: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/xfs/xfs_acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index ac702a6eab9..1d32f1d5276 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -337,7 +337,7 @@ xfs_xattr_acl_get(struct dentry *dentry, const char *name, if (acl == NULL) return -ENODATA; - error = posix_acl_to_xattr(acl, value, size); + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); posix_acl_release(acl); return error; @@ -361,7 +361,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, if (!value) goto set_acl; - acl = posix_acl_from_xattr(value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (!acl) { /* * acl_set_file(3) may request that we set default ACLs with -- cgit v1.2.3 From 74a8a103789465c4e67f38d1abb5cea770002601 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 Sep 2012 02:07:49 -0700 Subject: userns: Convert qutoactl Update the quotactl user space interface to successfull compile with user namespaces support enabled and to hand off quota identifiers to lower layers of the kernel in struct kqid instead of type and qid pairs. The quota on function is not converted because while it takes a quota type and an id. The id is the on disk quota format to use, which is something completely different. The signature of two struct quotactl_ops methods were changed to take struct kqid argumetns get_dqblk and set_dqblk. The dquot, xfs, and ocfs2 implementations of get_dqblk and set_dqblk are minimally changed so that the code continues to work with the change in parameter type. This is the first in a series of changes to always store quota identifiers in the kernel in struct kqid and only use raw type and qid values when interacting with on disk structures or userspace. Always using struct kqid internally makes it hard to miss places that need conversion to or from the kernel internal values. Cc: Jan Kara Cc: Dave Chinner Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: Alex Elder Signed-off-by: "Eric W. Biederman" --- fs/xfs/xfs_quotaops.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index fed504fc299..71926d63052 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -97,8 +97,7 @@ xfs_fs_set_xstate( STATIC int xfs_fs_get_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -108,14 +107,14 @@ xfs_fs_get_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, - int type, - qid_t id, + struct kqid qid, struct fs_disk_quota *fdq) { struct xfs_mount *mp = XFS_M(sb); @@ -127,7 +126,8 @@ xfs_fs_set_dqblk( if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); + return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { -- cgit v1.2.3 From 431f19744d15531825cdbc8e771b43854b0d005b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 16 Sep 2012 02:32:43 -0700 Subject: userns: Convert quota netlink aka quota_send_warning Modify quota_send_warning to take struct kqid instead a type and identifier pair. When sending netlink broadcasts always convert uids and quota identifiers into the intial user namespace. There is as yet no way to send a netlink broadcast message with different contents to receivers in different namespaces, so for the time being just map all of the identifiers into the initial user namespace which preserves the current behavior. Change the callers of quota_send_warning in gfs2, xfs and dquot to generate a struct kqid to pass to quota send warning. When all of the user namespaces convesions are complete a struct kqid values will be availbe without need for conversion, but a conversion is needed now to avoid needing to convert everything at once. Cc: Ben Myers Cc: Alex Elder Cc: Dave Chinner Cc: Jan Kara Cc: Steven Whitehouse Signed-off-by: "Eric W. Biederman" --- fs/xfs/xfs_trans_dquot.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index bcb60542fcf..0c7fa54f309 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -578,9 +578,11 @@ xfs_quota_warn( /* no warnings for project quotas - we just return ENOSPC later */ if (dqp->dq_flags & XFS_DQ_PROJ) return; - quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA, - be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev, - type); + quota_send_warning(make_kqid(&init_user_ns, + (dqp->dq_flags & XFS_DQ_USER) ? + USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id)), + mp->m_super->s_dev, type); } /* -- cgit v1.2.3 From 0ba6e5368c302819da7aff537e0d7eff2616c6a6 Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Thu, 13 Sep 2012 16:18:47 -0500 Subject: xfs: stop the sync worker before xfs_unmountfs Cancel work of the xfs_sync_worker before teardown of the log in xfs_unmountfs. This prevents occasional crashes on unmount like so: PID: 21602 TASK: ee9df060 CPU: 0 COMMAND: "kworker/0:3" #0 [c5377d28] crash_kexec at c0292c94 #1 [c5377d80] oops_end at c07090c2 #2 [c5377d98] no_context at c06f614e #3 [c5377dbc] __bad_area_nosemaphore at c06f6281 #4 [c5377df4] bad_area_nosemaphore at c06f629b #5 [c5377e00] do_page_fault at c070b0cb #6 [c5377e7c] error_code (via page_fault) at c070892c EAX: f300c6a8 EBX: f300c6a8 ECX: 000000c0 EDX: 000000c0 EBP: c5377ed0 DS: 007b ESI: 00000000 ES: 007b EDI: 00000001 GS: ffffad20 CS: 0060 EIP: c0481ad0 ERR: ffffffff EFLAGS: 00010246 #7 [c5377eb0] atomic64_read_cx8 at c0481ad0 #8 [c5377ebc] xlog_assign_tail_lsn_locked at f7cc7c6e [xfs] #9 [c5377ed4] xfs_trans_ail_delete_bulk at f7ccd520 [xfs] #10 [c5377f0c] xfs_buf_iodone at f7ccb602 [xfs] #11 [c5377f24] xfs_buf_do_callbacks at f7cca524 [xfs] #12 [c5377f30] xfs_buf_iodone_callbacks at f7cca5da [xfs] #13 [c5377f4c] xfs_buf_iodone_work at f7c718d0 [xfs] #14 [c5377f58] process_one_work at c024ee4c #15 [c5377f98] worker_thread at c024f43d #16 [c5377fbc] kthread at c025326b #17 [c5377fe8] kernel_thread_helper at c070e834 PID: 26653 TASK: e79143b0 CPU: 3 COMMAND: "umount" #0 [cde0fda0] __schedule at c0706595 #1 [cde0fe28] schedule at c0706b89 #2 [cde0fe30] schedule_timeout at c0705600 #3 [cde0fe94] __down_common at c0706098 #4 [cde0fec8] __down at c0706122 #5 [cde0fed0] down at c025936f #6 [cde0fee0] xfs_buf_lock at f7c7131d [xfs] #7 [cde0ff00] xfs_freesb at f7cc2236 [xfs] #8 [cde0ff10] xfs_fs_put_super at f7c80f21 [xfs] #9 [cde0ff1c] generic_shutdown_super at c0333d7a #10 [cde0ff38] kill_block_super at c0333e0f #11 [cde0ff48] deactivate_locked_super at c0334218 #12 [cde0ff58] deactivate_super at c033495d #13 [cde0ff68] mntput_no_expire at c034bc13 #14 [cde0ff7c] sys_umount at c034cc69 #15 [cde0ffa0] sys_oldumount at c034ccd4 #16 [cde0ffb0] system_call at c0707e66 commit 11159a05 added this to xfs_log_unmount and needs to be cleaned up at a later date. Signed-off-by: Ben Myers Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely --- fs/xfs/xfs_super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bdaf4cb9f4a..19e2380fb86 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -919,6 +919,7 @@ xfs_fs_put_super( struct xfs_mount *mp = XFS_M(sb); xfs_filestream_unmount(mp); + cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); xfs_syncd_stop(mp); xfs_freesb(mp); -- cgit v1.2.3 From c3a58fecdd1934a8538ada9073107625f5151687 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 17 Aug 2012 18:19:38 -0300 Subject: Make inode64 a remountable option Actually, there is no reason about why a user must umount and mount a XFS filesystem to enable 'inode64' option. So, this patch makes this a remountable option. Signed-off-by: Carlos Maiolino Reviewed-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 19e2380fb86..c416a01fcb1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -120,12 +120,13 @@ mempool_t *xfs_ioend_pool; * in the future, too. */ enum { - Opt_barrier, Opt_nobarrier, Opt_err + Opt_barrier, Opt_nobarrier, Opt_inode64, Opt_err }; static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, {Opt_err, NULL} }; @@ -1031,6 +1032,30 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +STATIC void +xfs_set_inode64(struct xfs_mount *mp) +{ + int i = 0; + + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, i); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + mp->m_maxagi = i; +} + STATIC int xfs_fs_remount( struct super_block *sb, @@ -1056,6 +1081,9 @@ xfs_fs_remount( case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; break; + case Opt_inode64: + xfs_set_inode64(mp); + break; default: /* * Logically we would return an error here to prevent -- cgit v1.2.3 From 8aea3ff411b2ce8fe7b46644298ed243a920eb24 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:36 -0300 Subject: xfs: Fix m_agirotor reset during AG selection xfs_ialloc_next_ag() currently resets m_agirotor when it is equal to m_maxagi: if (++mp->m_agirotor == mp->m_maxagi) mp->m_agirotor = 0; But, if for some reason mp->m_maxagi changes to a lower value than current m_agirotor, this condition will never be true, causing m_agirotor to exceed the maximum allowed value (m_maxagi). This implies mainly during lookups for xfs_perag structs in its radix tree, since the agno value used for the lookup is based on m_agirotor. An out-of-range m_agirotor may cause a lookup failure which in case will return NULL. As an example, the value of m_maxagi is decreased during inode64->inode32 remount process, case where I've found this problem. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5aceb3f8ecd..445bf1aef31 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -431,7 +431,7 @@ xfs_ialloc_next_ag( spin_lock(&mp->m_agirotor_lock); agno = mp->m_agirotor; - if (++mp->m_agirotor == mp->m_maxagi) + if (++mp->m_agirotor >= mp->m_maxagi) mp->m_agirotor = 0; spin_unlock(&mp->m_agirotor_lock); -- cgit v1.2.3 From 08bf540412ed82a15cb9068249ad49b410a7b082 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:37 -0300 Subject: xfs: make inode64 as the default allocation mode since 64-bit inodes can be accessed while using inode32, and these can also be used on 32-bit kernels, there is no reason to still keep inode32 as the default mount option. If the filesystem cannot handle 64bit inode numbers (i.e CONFIG_LBDAF is not enabled and BITS_PER_LONG == 32), XFS_MOUNT_SMALL_INUMS will still be set by default, so inode64 is not an unconditional default value. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c416a01fcb1..996257d36fd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool; * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ #define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ @@ -198,7 +200,9 @@ xfs_parseargs( */ mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; +#if !XFS_BIG_INUMS mp->m_flags |= XFS_MOUNT_SMALL_INUMS; +#endif /* * These can be overridden by the mount option parsing. @@ -295,6 +299,8 @@ xfs_parseargs( return EINVAL; } dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; #if !XFS_BIG_INUMS @@ -493,6 +499,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { -- cgit v1.2.3 From 2d2194f61fddab3a9731b6e7a7ae3a4a19dd810c Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:38 -0300 Subject: xfs: reduce code duplication handling inode32/64 options Add xfs_set_inode32() to be used to enable inode32 allocation mode. this will reduce the amount of duplicated code needed to mount/remount a filesystem with inode32 option. This patch also changes xfs_set_inode64() to return the maximum AG number that inodes can be allocated instead of set mp->m_maxagi by itself, so that the behaviour is the same as xfs_set_inode32(). This simplifies code that calls these functions and needs to know the maximum AG that inodes can be allocated in. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 43 +++----------------------- fs/xfs/xfs_super.c | 89 +++++++++++++++++++++++++++++++++++++++--------------- fs/xfs/xfs_super.h | 2 ++ 3 files changed, 72 insertions(+), 62 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 29c2f83d414..b2bd3a0e637 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -440,7 +440,7 @@ xfs_initialize_perag( xfs_agnumber_t agcount, xfs_agnumber_t *maxagi) { - xfs_agnumber_t index, max_metadata; + xfs_agnumber_t index; xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; @@ -500,43 +500,10 @@ xfs_initialize_perag( else mp->m_flags &= ~XFS_MOUNT_32BITINODES; - if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* - * Calculate how much should be reserved for inodes to meet - * the max inode percentage. - */ - if (mp->m_maxicount) { - __uint64_t icount; - - icount = sbp->sb_dblocks * sbp->sb_imax_pct; - do_div(icount, 100); - icount += sbp->sb_agblocks - 1; - do_div(icount, sbp->sb_agblocks); - max_metadata = icount; - } else { - max_metadata = agcount; - } - - for (index = 0; index < agcount; index++) { - ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > XFS_MAXINUMBER_32) { - index++; - break; - } - - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - if (index < max_metadata) - pag->pagf_metadata = 1; - xfs_perag_put(pag); - } - } else { - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - pag->pagi_inodeok = 1; - xfs_perag_put(pag); - } - } + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp); + else + index = xfs_set_inode64(mp); if (maxagi) *maxagi = index; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 996257d36fd..d6619d68553 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -599,6 +599,71 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); + xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = sbp->sb_agcount; + } + + for (index = 0; index < sbp->sb_agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + if (ino > XFS_MAXINUMBER_32) { + index++; + break; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + return index; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < mp->m_sb.sb_agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -1039,30 +1104,6 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } -STATIC void -xfs_set_inode64(struct xfs_mount *mp) -{ - int i = 0; - - for (i = 0; i < mp->m_sb.sb_agcount; i++) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, i); - pag->pagi_inodeok = 1; - pag->pagf_metadata = 0; - xfs_perag_put(pag); - } - - /* There is no need for lock protection on m_flags, - * the rw_semaphore of the VFS superblock is locked - * during mount/umount/remount operations, so this is - * enough to avoid concurency on the m_flags field - */ - mp->m_flags &= ~(XFS_MOUNT_32BITINODES | - XFS_MOUNT_SMALL_INUMS); - mp->m_maxagi = i; -} - STATIC int xfs_fs_remount( struct super_block *sb, diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 09b0c26b224..9de4a920ba0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -75,6 +75,8 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; -- cgit v1.2.3 From 4c0837224c677db35cd85b04a77504c496cadb66 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:39 -0300 Subject: xfs: Fix mp->m_maxagi update during inode64 remount With the changes made on xfs_set_inode64(), to make it behave as xfs_set_inode32() (now leaving to the caller the responsibility to update mp->m_maxagi), we use the return value of xfs_set_inode64() to update mp->m_maxagi during remount. Signed-off-by: Carlos Maiolino Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d6619d68553..aeb03f9a896 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1130,7 +1130,7 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: - xfs_set_inode64(mp); + mp->m_maxagi = xfs_set_inode64(mp); break; default: /* -- cgit v1.2.3 From 4056c1d08d2a7c50ae7414db7c1783ba45b4835d Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:40 -0300 Subject: xfs: add inode64->inode32 transition into xfs_set_inode32() To make inode32 a remountable option, xfs_set_inode32() should be able to make a transition from inode64 option, disabling inode allocation on higher AGs. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aeb03f9a896..168d4984ce8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -603,6 +603,7 @@ xfs_agnumber_t xfs_set_inode32(struct xfs_mount *mp) { xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; xfs_sb_t *sbp = &mp->m_sb; xfs_agnumber_t max_metadata; xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); @@ -626,18 +627,26 @@ xfs_set_inode32(struct xfs_mount *mp) for (index = 0; index < sbp->sb_agcount; index++) { ino = XFS_AGINO_TO_INO(mp, index, agino); + if (ino > XFS_MAXINUMBER_32) { - index++; - break; + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; } pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; + maxagi++; if (index < max_metadata) pag->pagf_metadata = 1; xfs_perag_put(pag); } - return index; + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; } xfs_agnumber_t -- cgit v1.2.3 From 2ea0392983a82f7dc3055568ae0f2558724d119b Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 20 Sep 2012 10:32:41 -0300 Subject: xfs: Make inode32 a remountable option As inode64 is the default option now, and was also made remountable previously, inode32 can also be remounted on-the-fly when it is needed. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 168d4984ce8..d93f2c7364c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -122,13 +122,18 @@ mempool_t *xfs_ioend_pool; * in the future, too. */ enum { - Opt_barrier, Opt_nobarrier, Opt_inode64, Opt_err + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err }; static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, {Opt_err, NULL} }; @@ -1141,6 +1146,9 @@ xfs_fs_remount( case Opt_inode64: mp->m_maxagi = xfs_set_inode64(mp); break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp); + break; default: /* * Logically we would return an error here to prevent -- cgit v1.2.3 From 1ea65c96077f9bb5c0e5e224a4da751d269c5f94 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 12:57:12 -0400 Subject: switch xfs_swapext() to fget_light() Signed-off-by: Al Viro --- fs/xfs/xfs_dfrag.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e00de08dc8a..e6cdf224d7e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -49,10 +49,10 @@ xfs_swapext( { xfs_inode_t *ip, *tip; struct file *file, *tmp_file; - int error = 0; + int error = 0, fput_needed, fput_needed_tmp; /* Pull information for the target fd */ - file = fget((int)sxp->sx_fdtarget); + file = fget_light((int)sxp->sx_fdtarget, &fput_needed); if (!file) { error = XFS_ERROR(EINVAL); goto out; @@ -65,7 +65,7 @@ xfs_swapext( goto out_put_file; } - tmp_file = fget((int)sxp->sx_fdtmp); + tmp_file = fget_light((int)sxp->sx_fdtmp, &fput_needed_tmp); if (!tmp_file) { error = XFS_ERROR(EINVAL); goto out_put_file; @@ -105,9 +105,9 @@ xfs_swapext( error = xfs_swap_extents(ip, tip, sxp); out_put_tmp_file: - fput(tmp_file); + fput_light(tmp_file, fput_needed_tmp); out_put_file: - fput(file); + fput_light(file, fput_needed); out: return error; } -- cgit v1.2.3 From 64e09fa2e1fef1696a8685c7aad7e0d3dd24ce71 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 12:59:52 -0400 Subject: switch xfs_find_handle() to fget_light() Signed-off-by: Al Viro --- fs/xfs/xfs_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e0232c3b6d..21483eac402 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -72,11 +72,11 @@ xfs_find_handle( struct inode *inode; struct file *file = NULL; struct path path; - int error; + int error, fput_needed; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - file = fget(hreq->fd); + file = fget_light(hreq->fd, &fput_needed); if (!file) return -EBADF; inode = file->f_path.dentry->d_inode; @@ -134,7 +134,7 @@ xfs_find_handle( out_put: if (cmd == XFS_IOC_FD_TO_HANDLE) - fput(file); + fput_light(file, fput_needed); else path_put(&path); return error; -- cgit v1.2.3 From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Aug 2012 12:52:22 -0400 Subject: switch simple cases of fget_light to fdget Signed-off-by: Al Viro --- fs/xfs/xfs_dfrag.c | 36 ++++++++++++++++++------------------ fs/xfs/xfs_ioctl.c | 12 ++++++------ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e6cdf224d7e..b9b8646e62d 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -48,44 +48,44 @@ xfs_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct file *file, *tmp_file; - int error = 0, fput_needed, fput_needed_tmp; + struct fd f, tmp; + int error = 0; /* Pull information for the target fd */ - file = fget_light((int)sxp->sx_fdtarget, &fput_needed); - if (!file) { + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { error = XFS_ERROR(EINVAL); goto out; } - if (!(file->f_mode & FMODE_WRITE) || - !(file->f_mode & FMODE_READ) || - (file->f_flags & O_APPEND)) { + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_file; } - tmp_file = fget_light((int)sxp->sx_fdtmp, &fput_needed_tmp); - if (!tmp_file) { + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { error = XFS_ERROR(EINVAL); goto out_put_file; } - if (!(tmp_file->f_mode & FMODE_WRITE) || - !(tmp_file->f_mode & FMODE_READ) || - (tmp_file->f_flags & O_APPEND)) { + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { error = XFS_ERROR(EBADF); goto out_put_tmp_file; } - if (IS_SWAPFILE(file->f_path.dentry->d_inode) || - IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) { + if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || + IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { error = XFS_ERROR(EINVAL); goto out_put_tmp_file; } - ip = XFS_I(file->f_path.dentry->d_inode); - tip = XFS_I(tmp_file->f_path.dentry->d_inode); + ip = XFS_I(f.file->f_path.dentry->d_inode); + tip = XFS_I(tmp.file->f_path.dentry->d_inode); if (ip->i_mount != tip->i_mount) { error = XFS_ERROR(EINVAL); @@ -105,9 +105,9 @@ xfs_swapext( error = xfs_swap_extents(ip, tip, sxp); out_put_tmp_file: - fput_light(tmp_file, fput_needed_tmp); + fdput(tmp); out_put_file: - fput_light(file, fput_needed); + fdput(f); out: return error; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 21483eac402..8305f2ac677 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,16 +70,16 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct file *file = NULL; + struct fd f; struct path path; - int error, fput_needed; + int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - file = fget_light(hreq->fd, &fput_needed); - if (!file) + f = fdget(hreq->fd); + if (!f.file) return -EBADF; - inode = file->f_path.dentry->d_inode; + inode = f.file->f_path.dentry->d_inode; } else { error = user_lpath((const char __user *)hreq->path, &path); if (error) @@ -134,7 +134,7 @@ xfs_find_handle( out_put: if (cmd == XFS_IOC_FD_TO_HANDLE) - fput_light(file, fput_needed); + fdput(f); else path_put(&path); return error; -- cgit v1.2.3 From 8c0a85377048b64c880e76ec7368904fe46d0b94 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 26 Sep 2012 11:33:07 +1000 Subject: fs: push rcu_barrier() from deactivate_locked_super() to filesystems There's no reason to call rcu_barrier() on every deactivate_locked_super(). We only need to make sure that all delayed rcu free inodes are flushed before we destroy related cache. Removing rcu_barrier() from deactivate_locked_super() affects some fast paths. E.g. on my machine exit_group() of a last process in IPC namespace takes 0.07538s. rcu_barrier() takes 0.05188s of that time. Signed-off-by: Kirill A. Shutemov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/xfs/xfs_super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 19e2380fb86..83d36e473d2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1506,6 +1506,11 @@ xfs_init_zones(void) STATIC void xfs_destroy_zones(void) { + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); -- cgit v1.2.3 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1eaeb8be3aa..aa473fa640a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -940,7 +940,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; -- cgit v1.2.3 From 35c2a7f4908d404c9124c2efc6ada4640ca4d5d5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 7 Oct 2012 20:32:51 -0700 Subject: tmpfs,ceph,gfs2,isofs,reiserfs,xfs: fix fh_len checking Fuzzing with trinity oopsed on the 1st instruction of shmem_fh_to_dentry(), u64 inum = fid->raw[2]; which is unhelpfully reported as at the end of shmem_alloc_inode(): BUG: unable to handle kernel paging request at ffff880061cd3000 IP: [] shmem_alloc_inode+0x40/0x40 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Call Trace: [] ? exportfs_decode_fh+0x79/0x2d0 [] do_handle_open+0x163/0x2c0 [] sys_open_by_handle_at+0xc/0x10 [] tracesys+0xe1/0xe6 Right, tmpfs is being stupid to access fid->raw[2] before validating that fh_len includes it: the buffer kmalloc'ed by do_sys_name_to_handle() may fall at the end of a page, and the next page not be present. But some other filesystems (ceph, gfs2, isofs, reiserfs, xfs) are being careless about fh_len too, in fh_to_dentry() and/or fh_to_parent(), and could oops in the same way: add the missing fh_len checks to those. Reported-by: Sasha Levin Signed-off-by: Hugh Dickins Cc: Al Viro Cc: Sage Weil Cc: Steven Whitehouse Cc: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/xfs/xfs_export.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 42679223a0f..8c6d1d70278 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -189,6 +189,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; struct inode *inode = NULL; + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + switch (fileid_type) { case FILEID_INO32_GEN_PARENT: inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, -- cgit v1.2.3 From 33c7a2bc48a81fa714572f8ce29f29bc17e6faf0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:55:59 +1100 Subject: xfs: xfs_syncd_stop must die xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated functionailty together that actually have different start and stop requirements. Kill these functions and open code the start/stop methods for each of the background functions. Subsequent patches will move the start/stop functions around to the correct places to avoid races and shutdown issues. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 25 ++++++++++++++++++------- fs/xfs/xfs_sync.c | 30 ++++-------------------------- fs/xfs/xfs_sync.h | 6 ++++-- 3 files changed, 26 insertions(+), 35 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd7f97..37d1bbce047 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1008,7 +1008,11 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1384,9 +1388,11 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); error = xfs_mountfs(mp); if (error) @@ -1409,8 +1415,10 @@ xfs_fs_fill_super( return 0; out_syncd_stop: - xfs_syncd_stop(mp); - out_filestream_unmount: + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1429,7 +1437,10 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 9500caf15ac..7502f0621fb 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -370,7 +370,7 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -static void +void xfs_syncd_queue_sync( struct xfs_mount *mp) { @@ -383,7 +383,7 @@ xfs_syncd_queue_sync( * disk quotas. We might need to cover the log to indicate that the * filesystem is idle and not frozen. */ -STATIC void +void xfs_sync_worker( struct work_struct *work) { @@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim( * goes low. It scans as quickly as possible avoiding locked inodes or those * already being flushed, and once done schedules a future pass. */ -STATIC void +void xfs_reclaim_worker( struct work_struct *work) { @@ -478,7 +478,7 @@ xfs_flush_inodes( flush_work(&mp->m_flush_work); } -STATIC void +void xfs_flush_worker( struct work_struct *work) { @@ -489,28 +489,6 @@ xfs_flush_worker( xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); } -int -xfs_syncd_init( - struct xfs_mount *mp) -{ - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - - return 0; -} - -void -xfs_syncd_stop( - struct xfs_mount *mp) -{ - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6..3f59e5bed66 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,10 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -int xfs_syncd_init(struct xfs_mount *mp); -void xfs_syncd_stop(struct xfs_mount *mp); +void xfs_syncd_queue_sync(struct xfs_mount *mp); +void xfs_sync_worker(struct work_struct *work); +void xfs_flush_worker(struct work_struct *work); +void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -- cgit v1.2.3 From 7e18530bef6a18a5479690ae7e8256319ecf1300 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:00 +1100 Subject: xfs: rationalise xfs_mount_wq users Instead of starting and stopping background work on the xfs_mount_wq all at the same time, separate them to where they really are needed to start and stop. The xfs_sync_worker, only needs to be started after all the mount processing has completed successfully, while it needs to be stopped before the log is unmounted. The xfs_reclaim_worker is started on demand, and can be stopped before the unmount process does it's own inode reclaim pass. The xfs_flush_inodes work is run on demand, and so we really only need to ensure that it has stopped running before we start processing an unmount, freeze or remount,ro. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 6 ++++-- fs/xfs/xfs_super.c | 34 ++++++++++++++-------------------- fs/xfs/xfs_sync.c | 21 +++++---------------- 3 files changed, 23 insertions(+), 38 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b2bd3a0e637..d9a31c6a0c5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1450,9 +1450,11 @@ xfs_unmountfs( /* * And reclaim all inodes. At this point there should be no dirty - * inode, and none should be pinned or locked, but use synchronous - * reclaim just to be sure. + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. */ + cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37d1bbce047..9805cac81fc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1005,14 +1005,12 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_filestream_unmount(mp); - cancel_delayed_work_sync(&mp->m_sync_work); - xfs_unmountfs(mp); - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); cancel_work_sync(&mp->m_flush_work); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1325,6 +1323,9 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; sb->s_fs_info = mp; @@ -1388,15 +1389,9 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { @@ -1413,12 +1408,15 @@ xfs_fs_fill_super( goto out_unmount; } + /* + * The filesystem is successfully mounted, so we can start background + * sync work now. + */ + xfs_syncd_queue_sync(mp); + return 0; - out_syncd_stop: - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); + out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1437,10 +1435,6 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 7502f0621fb..a68761696ab 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -379,9 +379,9 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items, reclaim inodes and sync - * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle and not frozen. + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle and not frozen. */ void xfs_sync_worker( @@ -391,17 +391,7 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - /* - * We shouldn't write/force the log if we are in the mount/unmount - * process or on a read only filesystem. The workqueue still needs to be - * active in both cases, however, because it is used for inode reclaim - * during these times. Use the MS_ACTIVE flag to avoid doing anything - * during mount. Doing work during unmount is avoided by calling - * cancel_delayed_work_sync on this work queue before tearing down - * the ail and the log in xfs_log_unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) @@ -409,8 +399,7 @@ xfs_sync_worker( else xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently - * dirty */ + /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); } -- cgit v1.2.3 From 7f7bebefba152c5bdfe961cd2e97e8695a32998c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:01 +1100 Subject: xfs: don't run the sync work if the filesystem is read-only If the filesystem is mounted or remounted read-only, stop the sync worker that tries to flush or cover the log if the filesystem is dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw as necessary. This avoids the need for RO checks in the work. Similarly, stop the sync work when the filesystem is frozen, and start it again when the filesysetm is thawed. This avoids the need for special freeze checks in the work. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 ++ fs/xfs/xfs_sync.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9805cac81fc..20fa955d80d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1200,6 +1200,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); } /* rw -> ro */ @@ -1245,6 +1246,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); return 0; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index a68761696ab..e898d180704 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -323,6 +323,9 @@ xfs_quiesce_data( * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. + * + * Note: this stops background sync work - the callers must ensure it is started + * again when appropriate. */ void xfs_quiesce_attr( @@ -341,6 +344,9 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); + /* stop background sync work */ + cancel_delayed_work_sync(&mp->m_sync_work); + /* * Just warn here till VFS can correctly support * read-only remount without racing. @@ -379,9 +385,8 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items in the AIL and push them to - * disk. If there is nothing dirty, then we might need to cover the log to - * indicate that the filesystem is idle and not frozen. + * Every sync period we need to push dirty metadata and try to cover the log + * to indicate the filesystem is idle and not frozen. */ void xfs_sync_worker( @@ -391,17 +396,15 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); + /* dgc: errors ignored here */ + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - } + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); /* queue us up again */ xfs_syncd_queue_sync(mp); -- cgit v1.2.3 From f661f1e0bf5002bdcc8b5810ad0a184a1841537f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:02 +1100 Subject: xfs: sync work is now only periodic log work The only thing the periodic sync work does now is flush the AIL and idle the log. These are really functions of the log code, so move the work to xfs_log.c and rename it appropriately. The only wart that this leaves behind is the xfssyncd_centisecs sysctl, otherwise the xfssyncd is dead. Clean up any comments that related to xfssyncd to reflect it's passing. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 61 ++++++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log.h | 3 +++ fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 16 ++++---------- fs/xfs/xfs_sync.c | 39 +++----------------------------- fs/xfs/xfs_sync.h | 2 -- 7 files changed, 62 insertions(+), 61 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e..efea12bfbd6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" kmem_zone_t *xfs_log_ticket_zone; @@ -679,25 +680,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -858,7 +863,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -1161,6 +1166,40 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, @@ -1195,6 +1234,7 @@ xlog_alloc_log( log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -3700,3 +3740,4 @@ xlog_iclogs_empty( } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e..26ed7de352d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,8 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); + #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 18a801d76a4..9a4e0e5ec32 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -495,6 +495,7 @@ struct xlog { struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e534d..26e46aeaa3f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -197,7 +197,6 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct delayed_work m_sync_work; /* background sync work */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 20fa955d80d..37c39a155a5 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1005,7 +1005,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_delayed_work_sync(&mp->m_sync_work); cancel_work_sync(&mp->m_flush_work); xfs_filestream_unmount(mp); @@ -1040,10 +1039,10 @@ xfs_fs_sync_fs( if (laptop_mode) { /* * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is + * We schedule log work now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work(&mp->m_sync_work); + flush_delayed_work(&mp->m_log->l_work); } return 0; @@ -1200,7 +1199,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); } /* rw -> ro */ @@ -1246,7 +1245,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); return 0; } @@ -1326,7 +1325,6 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; @@ -1410,12 +1408,6 @@ xfs_fs_fill_super( goto out_unmount; } - /* - * The filesystem is successfully mounted, so we can start background - * sync work now. - */ - xfs_syncd_queue_sync(mp); - return 0; out_filestream_unmount: diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index e898d180704..2174555aebb 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -19,6 +19,7 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -344,8 +345,8 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); - /* stop background sync work */ - cancel_delayed_work_sync(&mp->m_sync_work); + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); /* * Just warn here till VFS can correctly support @@ -376,40 +377,6 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -void -xfs_syncd_queue_sync( - struct xfs_mount *mp) -{ - queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, - msecs_to_jiffies(xfs_syncd_centisecs * 10)); -} - -/* - * Every sync period we need to push dirty metadata and try to cover the log - * to indicate the filesystem is idle and not frozen. - */ -void -xfs_sync_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_sync_work); - int error; - - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - - /* queue us up again */ - xfs_syncd_queue_sync(mp); -} - /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 3f59e5bed66..8d58fab72a1 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,6 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_syncd_queue_sync(struct xfs_mount *mp); -void xfs_sync_worker(struct work_struct *work); void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); -- cgit v1.2.3 From cf2931db2d189ce0583be7ae880d7e3f8c15f623 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:03 +1100 Subject: xfs: Bring some sanity to log unmounting When unmounting the filesystem, there are lots of operations that need to be done in a specific order, and they are spread across across a couple of functions. We have to drain the AIL before we write the unmount record, and we have to shut down the background log work before we do either of them. But this is all split haphazardly across xfs_unmountfs() and xfs_log_unmount(). Move all the AIL flushing and log manipulations to xfs_log_unmount() so that the responisbilities of each function is clear and the operations they perform obvious. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 29 ++++++++++++++++++++++++++--- fs/xfs/xfs_mount.c | 24 ------------------------ 2 files changed, 26 insertions(+), 27 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index efea12bfbd6..e788f39721e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,15 +855,38 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Shut down and release the AIL and Log. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record, tear down the AIL and finally free the log. */ void xfs_log_unmount(xfs_mount_t *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d9a31c6a0c5..c195ec85c72 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1459,13 +1459,6 @@ xfs_unmountfs( xfs_qm_unmount(mp); - /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1491,23 +1484,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); -- cgit v1.2.3 From 9aa05000f2b7cab4be582afba64af10b2d74727e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:04 +1100 Subject: xfs: xfs_sync_data is redundant. We don't do any data writeback from XFS any more - the VFS is completely responsible for that, including for freeze. We can replace the remaining caller with a VFS level function that achieves the same thing, but without conflicting with current writeback work. This means we can remove the flush_work and xfs_flush_inodes() - the VFS functionality completely replaces the internal flush queue for doing this writeback work in a separate context to avoid stack overruns. This does have one complication - it cannot be called with page locks held. Hence move the flushing of delalloc space when ENOSPC occurs back up into xfs_file_aio_buffered_write when we don't hold any locks that will stall writeback. Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc conversion fast enough to prevent spurious ENOSPC whent here are hundreds of writers, thousands of small files and GBs of free RAM. Hence we need to use sync_sb_inodes() to block callers while we wait for writeback like the previous xfs_flush_inodes implementation did. That means we have to hold the s_umount lock here, but because this call can nest inside i_mutex (the parent directory in the create case, held by the VFS), we have to use down_read_trylock() to avoid potential deadlocks. In practice, this trylock will succeed on almost every attempt as unmount/remount type operations are exceedingly rare. Note: we always need to pass a count of zero to generic_file_buffered_write() as the previously written byte count. We only do this by accident before this patch by the virtue of ret always being zero when there are no errors. Make this explicit rather than needing to specifically zero ret in the ENOSPC retry case. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 13 +++++---- fs/xfs/xfs_iomap.c | 23 +++++---------- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 21 ++++++++++++-- fs/xfs/xfs_super.h | 1 + fs/xfs/xfs_sync.c | 78 --------------------------------------------------- fs/xfs/xfs_sync.h | 3 -- fs/xfs/xfs_vnodeops.c | 2 +- 8 files changed, 34 insertions(+), 108 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aa473fa640a..daf4066c24b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -728,16 +728,17 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); + pos, &iocb->ki_pos, count, 0); + /* - * if we just got an ENOSPC, flush the inode now we aren't holding any - * page locks and retry *once* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. */ if (ret == -ENOSPC && !enospc) { enospc = 1; - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + xfs_flush_inodes(ip->i_mount); + goto write_retry; } current->backing_dev_info = NULL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad93..f858b903678 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -373,7 +373,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, flushed = 0; + int prealloc; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -434,26 +434,17 @@ retry: } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); - if (flushed) - return XFS_ERROR(error ? error : ENOSPC); - - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; } - - flushed = 1; - error = 0; - prealloc = 0; - goto retry; + return XFS_ERROR(error ? error : ENOSPC); } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 26e46aeaa3f..a54b5aa498d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -198,7 +198,6 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37c39a155a5..9468c687846 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_unwritten_workqueue); } +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1005,8 +1023,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_work_sync(&mp->m_flush_work); - xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1324,7 +1340,6 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 9de4a920ba0..bbe3d15a790 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -74,6 +74,7 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); +extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 2174555aebb..6a2ada37916 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -216,51 +216,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_inode_data( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct inode *inode = VFS_I(ip); - struct address_space *mapping = inode->i_mapping; - int error = 0; - - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (flags & SYNC_TRYLOCK) - return 0; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? - 0 : XBF_ASYNC, FI_NONE); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; -} - -/* - * Write out pagecache data for the whole filesystem. - */ -STATIC int -xfs_sync_data( - struct xfs_mount *mp, - int flags) -{ - int error; - - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); - if (error) - return XFS_ERROR(error); - - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return 0; -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -415,39 +370,6 @@ xfs_reclaim_worker( xfs_syncd_queue_reclaim(mp); } -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room. - * - * Queue a new data flush if there isn't one already in progress and - * wait for completion of the flush. This means that we only ever have one - * inode flush in progress no matter how many ENOSPC events are occurring and - * so will prevent the system from bogging down due to every concurrent - * ENOSPC event scanning all the active inodes in the system for writeback. - */ -void -xfs_flush_inodes( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - - queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work(&mp->m_flush_work); -} - -void -xfs_flush_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(work, - struct xfs_mount, m_flush_work); - - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 8d58fab72a1..0018e846f0d 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,14 +26,11 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inodes(struct xfs_inode *ip); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2a5c637344b..14928564f10 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -777,7 +777,7 @@ xfs_create( XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(dp); + xfs_flush_inodes(mp); error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); } -- cgit v1.2.3 From 5889608df35783590251cfd440fa5d48f1855179 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:05 +1100 Subject: xfs: syncd workqueue is no more With the syncd functions moved to the log and/or removed, the syncd workqueue is the only remaining bit left. It is used by the log covering/ail pushing work, as well as by the inode reclaim work. Given how cheap workqueues are these days, give the log and inode reclaim work their own work queues and kill the syncd work queue. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 38 ++++++++++++++++++-------------------- fs/xfs/xfs_sync.c | 20 +++++++++----------- fs/xfs/xfs_sync.h | 2 -- 5 files changed, 30 insertions(+), 34 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e788f39721e..b6ce4d4b6de 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1193,7 +1193,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a54b5aa498d..7c417b6b99e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,6 +207,8 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9468c687846..27d5a92e121 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -863,8 +863,23 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + return 0; +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,6 +892,8 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1391,10 +1408,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1638,16 +1651,6 @@ xfs_destroy_zones(void) STATIC int __init xfs_init_workqueues(void) { - /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. - */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of @@ -1656,20 +1659,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_alloc_wq); - destroy_workqueue(xfs_syncd_wq); } STATIC int __init diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 6a2ada37916..15be21f074f 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -40,8 +40,6 @@ #include #include -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -335,18 +333,18 @@ xfs_quiesce_attr( /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. */ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -367,7 +365,7 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } void @@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -646,9 +644,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. */ return 0; } @@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0018e846f0d..0beabea99e7 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -24,8 +24,6 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); -- cgit v1.2.3 From 34061f5c420561dd42addd252811a1fa4b0ac69b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:06 +1100 Subject: xfs: xfs_sync_fsdata is redundant Why do we need to write the superblock to disk once we've written all the data? We don't actually - the reasons for doing this are lost in the mists of time, and go back to the way Irix used to drive VFS flushing. On linux, this code is only called from two contexts: remount and .sync_fs. In the remount case, the call is followed by a metadata sync, which unpins and writes the superblock. In the sync_fs case, we only need to force the log to disk to ensure that the superblock is correctly on disk, so we don't actually need to write it. Hence the functionality is either redundant or superfluous and thus can be removed. Seeing as xfs_quiesce_data is essentially now just a log force, remove it as well and fold the code back into the two callers. Neither of them need the log covering check, either, as that is redundant for the remount case, and unnecessary for the .sync_fs case. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 19 +++++----------- fs/xfs/xfs_sync.c | 67 +++++++----------------------------------------------- 2 files changed, 14 insertions(+), 72 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27d5a92e121..b5e445a13f7 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1057,7 +1057,6 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); - int error; /* * Doing anything during the async pass would be counterproductive. @@ -1065,10 +1064,7 @@ xfs_fs_sync_fs( if (!wait) return 0; - error = xfs_quiesce_data(mp); - if (error) - return -error; - + xfs_log_force(mp, XFS_LOG_SYNC); if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -1238,15 +1234,12 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { /* - * After we have synced the data but before we sync the - * metadata, we need to free up the reserve block pool so that - * the used block count in the superblock on disk is correct at - * the end of the remount. Stash the current reserve pool size - * so that if we get remounted rw, we can return it to the same - * size. + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. */ - - xfs_quiesce_data(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 15be21f074f..581eb59a85b 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,70 +214,16 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_fsdata( - struct xfs_mount *mp) -{ - struct xfs_buf *bp; - int error; - - /* - * If the buffer is pinned then push on the log so we won't get stuck - * waiting in the write for someone, maybe ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have the - * superblock buffer locked at that point so it can become pinned in - * between there and here. - */ - bp = xfs_getsb(mp, 0); - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - return error; -} - -/* - * When remounting a filesystem read-only or freezing the filesystem, we have - * two phases to execute. This first phase is syncing the data before we - * quiesce the filesystem, and the second is flushing all the inodes out after - * we've waited for all the transactions created by the first phase to - * complete. The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - */ -/* - * First stage of freeze - no writers will make progress now we are here, - * so we flush delwri and delalloc buffers here, then wait for all I/O to - * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother emptying the AIL - * because it'll just get dirty again. - */ -int -xfs_quiesce_data( - struct xfs_mount *mp) -{ - int error, error2 = 0; - - /* force out the log */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp); - - /* mark the log as covered if needed */ - if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp); - - return error ? error : error2; -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. * + * The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + * * Note: this stops background sync work - the callers must ensure it is started * again when appropriate. */ @@ -291,6 +237,9 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + /* reclaim inodes to do any IO before the freeze completes */ xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); -- cgit v1.2.3 From c7eea6f7adca4501d2c2db7f0f7c9dc88efac95e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:07 +1100 Subject: xfs: move xfs_quiesce_attr() into xfs_super.c Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's nothing really sync-specific about this functionality so it doesn't really matter where it lives. Move it to benext to it's callers, so all the remount/sync_fs code is in the one place. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sync.c | 65 ---------------------------------------------------- fs/xfs/xfs_sync.h | 3 --- 3 files changed, 67 insertions(+), 68 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b5e445a13f7..3bafe66227f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1148,6 +1148,73 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk (this is the main + * difference between a sync and a quiesce). + * + * This shoul deffectively mimic the code in xfs_unmountfs() and + * xfs_log_umount() but without tearing down any structures. + * XXX: bug fixes needed! + * + * Note: this stops background log work - the callers must ensure it is started + * again when appropriate. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); + + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); + + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); +} + STATIC int xfs_fs_remount( struct super_block *sb, diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 581eb59a85b..7b630288bab 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,71 +214,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceeding. - * - * The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - * - * Note: this stops background sync work - the callers must ensure it is started - * again when appropriate. - */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* force the log to unpin objects from the now complete transactions */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); - - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* stop background log work */ - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); -} - /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0beabea99e7..0ba9c89c316 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,9 +26,6 @@ struct xfs_perag; void xfs_reclaim_worker(struct work_struct *work); -int xfs_quiesce_data(struct xfs_mount *mp); -void xfs_quiesce_attr(struct xfs_mount *mp); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -- cgit v1.2.3 From c75921a72a7c4bb73a5e09a697a672722e5543f1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:08 +1100 Subject: xfs: xfs_quiesce_attr() should quiesce the log like unmount xfs_quiesce_attr() is supposed to leave the log empty with an unmount record written. Right now it does not wait for the AIL to be emptied before writing the unmount record, not does it wait for metadata IO completion, either. Fix it to use the same method and code as xfs_log_unmount(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 25 ++++++++++++++++++------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_super.c | 41 ++++++++--------------------------------- 3 files changed, 27 insertions(+), 40 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b6ce4d4b6de..d2d59692739 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,20 +855,17 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Shut down and release the AIL and Log. - * - * During unmount, we need to ensure we flush all the dirty metadata objects - * from the AIL so that the log is empty before we write the unmount record to - * the log. + * Empty the log for unmount/freeze. * * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up. We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and * run the callbacks removing themselves from the AIL, we can write the unmount - * record, tear down the AIL and finally free the log. + * record. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -886,6 +883,20 @@ xfs_log_unmount(xfs_mount_t *mp) xfs_buf_unlock(mp->m_sb_bp); xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 26ed7de352d..5caee96059d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -183,6 +183,7 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3bafe66227f..fdedf2cabae 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1153,15 +1153,11 @@ xfs_restore_resvblks(struct xfs_mount *mp) * * This ensures that the metadata is written to their location on disk rather * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk (this is the main - * difference between a sync and a quiesce). + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. * - * This shoul deffectively mimic the code in xfs_unmountfs() and - * xfs_log_umount() but without tearing down any structures. - * XXX: bug fixes needed! - * - * Note: this stops background log work - the callers must ensure it is started - * again when appropriate. + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. */ void xfs_quiesce_attr( @@ -1180,39 +1176,18 @@ xfs_quiesce_attr( xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* stop background log work */ - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp); if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. + * Just warn here till VFS can correctly support + * read-only remount without racing. */ - xfs_ail_push_all_sync(mp->m_ail); + WARN_ON(atomic_read(&mp->m_active_trans) != 0); - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); + xfs_log_quiesce(mp); } STATIC int -- cgit v1.2.3 From 6d8b79cfca39399ef9115fb65dde85993455c9a3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:09 +1100 Subject: xfs: rename xfs_sync.[ch] to xfs_icache.[ch] xfs_sync.c now only contains inode reclaim functions and inode cache iteration functions. It is not related to sync operations anymore. Rename to xfs_icache.c to reflect it's contents and prepare for consolidation with the other inode cache file that exists (xfs_iget.c). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Makefile | 2 +- fs/xfs/xfs_icache.c | 715 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 43 +++ fs/xfs/xfs_iget.c | 1 + fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 2 - fs/xfs/xfs_qm_syscalls.c | 1 + fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_sync.c | 714 ---------------------------------------------- fs/xfs/xfs_sync.h | 43 --- 10 files changed, 763 insertions(+), 761 deletions(-) create mode 100644 fs/xfs/xfs_icache.c create mode 100644 fs/xfs/xfs_icache.h delete mode 100644 fs/xfs/xfs_sync.c delete mode 100644 fs/xfs/xfs_sync.h (limited to 'fs/xfs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2..442f256dbca 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -39,6 +39,7 @@ xfs-y += xfs_aops.o \ xfs_fsops.o \ xfs_fs_subr.o \ xfs_globals.o \ + xfs_icache.o \ xfs_iget.o \ xfs_ioctl.o \ xfs_iomap.o \ @@ -47,7 +48,6 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mru_cache.o \ xfs_super.o \ - xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ xfs_utils.o \ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c new file mode 100644 index 00000000000..eba216f11d5 --- /dev/null +++ b/fs/xfs/xfs_icache.c @@ -0,0 +1,715 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_icache.h" + +#include +#include + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; + } + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags), + int flags) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. + */ + if ((flags & SYNC_TRYLOCK) && + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + return 1; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, async - requeue + * dirty, sync 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. + * + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, async => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, async => requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + struct xfs_buf *bp = NULL; + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (is_bad_inode(VFS_I(ip))) + goto reclaim; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* + * Now we have an inode that needs flushing. + * + * Note that xfs_iflush will never block on the inode buffer lock, as + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. + */ + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + + xfs_iflock(ip); +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_inode_free(ip); + return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. + */ + return 0; +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + cond_resched(); + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Scan a certain number of inodes for reclaim. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +void +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) +{ + /* kick background reclaimer and push the AIL */ + xfs_reclaim_work_queue(mp); + xfs_ail_push_all(mp->m_ail); + + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} + +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; + + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h new file mode 100644 index 00000000000..0ba9c89c316 --- /dev/null +++ b/fs/xfs/xfs_icache.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +void xfs_reclaim_worker(struct work_struct *work); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); +void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, + struct xfs_inode *ip); + +int xfs_sync_inode_grab(struct xfs_inode *ip); +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), + int flags); + +#endif diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 784a803383e..069c5ceb945 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -38,6 +38,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c195ec85c72..6f1c997704c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_fsops.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7c417b6b99e..a631ca3b906 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations { #else /* __KERNEL__ */ -#include "xfs_sync.h" - struct xlog; struct xfs_inode; struct xfs_mru_cache; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b18611..7a9071f8855 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fdedf2cabae..3d9ea947e9f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,7 +49,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" -#include "xfs_sync.h" +#include "xfs_icache.h" #include "xfs_trace.h" #include diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c deleted file mode 100644 index 7b630288bab..00000000000 --- a/fs/xfs/xfs_sync.c +++ /dev/null @@ -1,714 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_log_priv.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_dinode.h" -#include "xfs_error.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" -#include "xfs_inode_item.h" -#include "xfs_quota.h" -#include "xfs_trace.h" -#include "xfs_fsops.h" - -#include -#include - -/* - * The inode lookup is done in batches to keep the amount of lock traffic and - * radix tree lookups to a minimum. The batch size is a trade off between - * lookup reduction and stack usage. This is in the reclaim path, so we can't - * be too greedy. - */ -#define XFS_LOOKUP_BATCH 32 - -STATIC int -xfs_inode_ag_walk_grab( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(rcu_read_lock_held()); - - /* - * check for stale RCU freed inode - * - * If the inode has been reallocated, it doesn't matter if it's not in - * the AG we are walking - we are walking for writeback, so if it - * passes all the "valid inode" checks and is dirty, then we'll write - * it back anyway. If it has been reallocated and still being - * initialised, the XFS_INEW check below will catch it. - */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return EFSCORRUPTED; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - return ENOENT; - - if (is_bad_inode(inode)) { - IRELE(ip); - return ENOENT; - } - - /* inode is valid */ - return 0; - -out_unlock_noent: - spin_unlock(&ip->i_flags_lock); - return ENOENT; -} - -STATIC int -xfs_inode_ag_walk( - struct xfs_mount *mp, - struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) -{ - uint32_t first_index; - int last_error = 0; - int skipped; - int done; - int nr_found; - -restart: - done = 0; - skipped = 0; - first_index = 0; - nr_found = 0; - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int error = 0; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - if (!nr_found) { - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || xfs_inode_ag_walk_grab(ip)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = execute(batch[i], pag, flags); - IRELE(batch[i]); - if (error == EAGAIN) { - skipped++; - continue; - } - if (error && last_error != EFSCORRUPTED) - last_error = error; - } - - /* bail out if the filesystem is corrupted. */ - if (error == EFSCORRUPTED) - break; - - cond_resched(); - - } while (nr_found && !done); - - if (skipped) { - delay(1); - goto restart; - } - return last_error; -} - -int -xfs_inode_ag_iterator( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get(mp, ag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == EFSCORRUPTED) - break; - } - } - return XFS_ERROR(last_error); -} - -/* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* - * Grab the inode for reclaim exclusively. - * Return 0 if we grabbed it, non-zero otherwise. - */ -STATIC int -xfs_reclaim_inode_grab( - struct xfs_inode *ip, - int flags) -{ - ASSERT(rcu_read_lock_held()); - - /* quick check for stale RCU freed inode */ - if (!ip->i_ino) - return 1; - - /* - * If we are asked for non-blocking operation, do unlocked checks to - * see if the inode already is being flushed or in reclaim to avoid - * lock traffic. - */ - if ((flags & SYNC_TRYLOCK) && - __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) - return 1; - - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - * - * Due to RCU lookup, we may find inodes that have been freed and only - * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that - * aren't candidates for reclaim at all, so we must check the - * XFS_IRECLAIMABLE is set first before proceeding to reclaim. - */ - spin_lock(&ip->i_flags_lock); - if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || - __xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* not a reclaim candidate. */ - spin_unlock(&ip->i_flags_lock); - return 1; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - return 0; -} - -/* - * Inodes in different states need to be treated differently. The following - * table lists the inode states and the reclaim actions necessary: - * - * inode state iflush ret required action - * --------------- ---------- --------------- - * bad - reclaim - * shutdown EIO unpin and reclaim - * clean, unpinned 0 reclaim - * stale, unpinned 0 reclaim - * clean, pinned(*) 0 requeue - * stale, pinned EAGAIN requeue - * dirty, async - requeue - * dirty, sync 0 reclaim - * - * (*) dgc: I don't think the clean, pinned state is possible but it gets - * handled anyway given the order of checks implemented. - * - * Also, because we get the flush lock first, we know that any inode that has - * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. - * - * Note that because the inode is flushed delayed write by AIL pushing, the - * flush lock may already be held here and waiting on it can result in very - * long latencies. Hence for sync reclaims, where we wait on the flush lock, - * the caller should push the AIL first before trying to reclaim inodes to - * minimise the amount of time spent waiting. For background relaim, we only - * bother to reclaim clean inodes anyway. - * - * Hence the order of actions after gaining the locks should be: - * bad => reclaim - * shutdown => unpin and reclaim - * pinned, async => requeue - * pinned, sync => unpin - * stale => reclaim - * clean => reclaim - * dirty, async => requeue - * dirty, sync => flush, wait and reclaim - */ -STATIC int -xfs_reclaim_inode( - struct xfs_inode *ip, - struct xfs_perag *pag, - int sync_mode) -{ - struct xfs_buf *bp = NULL; - int error; - -restart: - error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!xfs_iflock_nowait(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out; - xfs_iflock(ip); - } - - if (is_bad_inode(VFS_I(ip))) - goto reclaim; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_iunpin_wait(ip); - xfs_iflush_abort(ip, false); - goto reclaim; - } - if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - xfs_iunpin_wait(ip); - } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) - goto reclaim; - - /* - * Never flush out dirty data during non-blocking reclaim, as it would - * just contend with AIL pushing trying to do the same job. - */ - if (!(sync_mode & SYNC_WAIT)) - goto out_ifunlock; - - /* - * Now we have an inode that needs flushing. - * - * Note that xfs_iflush will never block on the inode buffer lock, as - * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_imap_to_bp() to get the cluster buffer would - * result in an ABBA deadlock with xfs_ifree_cluster(). - * - * As xfs_ifree_cluser() must gather all inodes that are active in the - * cache to mark them stale, if we hit this case we don't actually want - * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error here, just unlock the - * inode, back off and try again. Hopefully the next pass through will - * see the stale flag set on the inode. - */ - error = xfs_iflush(ip, &bp); - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - - if (!error) { - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - } - - xfs_iflock(ip); -reclaim: - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - XFS_STATS_INC(xs_ig_reclaims); - /* - * Remove the inode from the per-AG radix tree. - * - * Because radix_tree_delete won't complain even if the item was never - * added to the tree assert that it's been there before to catch - * problems with the inode life time early on. - */ - spin_lock(&pag->pag_ici_lock); - if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) - ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); - spin_unlock(&pag->pag_ici_lock); - - /* - * Here we do an (almost) spurious inode lock in order to coordinate - * with inode cache radix tree lookups. This is because the lookup - * can reference the inodes in the cache without taking references. - * - * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - xfs_inode_free(ip); - return error; - -out_ifunlock: - xfs_ifunlock(ip); -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and the reclaim work never goes back to - * the idle state. Instead, return 0 to let the next scheduled - * background reclaim attempt to reclaim the inode again. - */ - return 0; -} - -/* - * Walk the AGs and reclaim the inodes in them. Even if the filesystem is - * corrupted, we still want to try to reclaim all the inodes. If we don't, - * then a shut down during filesystem unmount reclaim walk leak all the - * unreclaimed inodes. - */ -int -xfs_reclaim_inodes_ag( - struct xfs_mount *mp, - int flags, - int *nr_to_scan) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - int trylock = flags & SYNC_TRYLOCK; - int skipped; - -restart: - ag = 0; - skipped = 0; - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - unsigned long first_index = 0; - int done = 0; - int nr_found = 0; - - ag = pag->pag_agno + 1; - - if (trylock) { - if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { - skipped++; - xfs_perag_put(pag); - continue; - } - first_index = pag->pag_ici_reclaim_cursor; - } else - mutex_lock(&pag->pag_ici_reclaim_lock); - - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH, - XFS_ICI_RECLAIM_TAG); - if (!nr_found) { - done = 1; - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || xfs_reclaim_inode_grab(ip, flags)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can - * occur if we have inodes in the last block of - * the AG and we are currently pointing to the - * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - error = xfs_reclaim_inode(batch[i], pag, flags); - if (error && last_error != EFSCORRUPTED) - last_error = error; - } - - *nr_to_scan -= XFS_LOOKUP_BATCH; - - cond_resched(); - - } while (nr_found && !done && *nr_to_scan > 0); - - if (trylock && !done) - pag->pag_ici_reclaim_cursor = first_index; - else - pag->pag_ici_reclaim_cursor = 0; - mutex_unlock(&pag->pag_ici_reclaim_lock); - xfs_perag_put(pag); - } - - /* - * if we skipped any AG, and we still have scan count remaining, do - * another pass this time using blocking reclaim semantics (i.e - * waiting on the reclaim locks and ignoring the reclaim cursors). This - * ensure that when we get more reclaimers than AGs we block rather - * than spin trying to execute reclaim. - */ - if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { - trylock = 0; - goto restart; - } - return XFS_ERROR(last_error); -} - -int -xfs_reclaim_inodes( - xfs_mount_t *mp, - int mode) -{ - int nr_to_scan = INT_MAX; - - return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); -} - -/* - * Scan a certain number of inodes for reclaim. - * - * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doing synchronous - * reclaim of inodes. That means if we come across dirty inodes, we wait for - * them to be cleaned, which we hope will not be very long due to the - * background walker having already kicked the IO off on those dirty inodes. - */ -void -xfs_reclaim_inodes_nr( - struct xfs_mount *mp, - int nr_to_scan) -{ - /* kick background reclaimer and push the AIL */ - xfs_reclaim_work_queue(mp); - xfs_ail_push_all(mp->m_ail); - - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); -} - -/* - * Return the number of reclaimable inodes in the filesystem for - * the shrinker to determine how much to reclaim. - */ -int -xfs_reclaim_inodes_count( - struct xfs_mount *mp) -{ - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; - int reclaimable = 0; - - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - ag = pag->pag_agno + 1; - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } - return reclaimable; -} - diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h deleted file mode 100644 index 0ba9c89c316..00000000000 --- a/fs/xfs/xfs_sync.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef XFS_SYNC_H -#define XFS_SYNC_H 1 - -struct xfs_mount; -struct xfs_perag; - -#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ -#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ - -void xfs_reclaim_worker(struct work_struct *work); - -int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); -int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); - -void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); - -int xfs_sync_inode_grab(struct xfs_inode *ip); -int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); - -#endif -- cgit v1.2.3 From fa96acadf1eb712fca6d59922ad93787c87e44ec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:10 +1100 Subject: xfs: move inode locking functions to xfs_inode.c xfs_ilock() and friends really aren't related to the inode cache in any way, so move them to xfs_inode.c with all the other inode related functionality. While doing this move, move the xfs_ilock() tracepoints to *before* the lock is taken so that when a hang on a lock occurs we have events to indicate which process and what inode we were trying to lock when the hang occurred. This is much better than the current silence we get on a hang... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_iget.c | 251 ----------------------------------------------------- fs/xfs/xfs_inode.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 251 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 069c5ceb945..ea9a5fa49a4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -453,254 +453,3 @@ out_error_or_again: return error; } -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. - */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - - if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - - trace_xfs_ilock(ip, lock_flags, _RET_IP_); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - */ -int -xfs_ilock_nowait( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) - goto out; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) - goto out; - } - if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; - } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; - } - trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - return 1; - - out_undo_iolock: - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - out: - return 0; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - - if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); - else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); - - trace_xfs_iunlock(ip, lock_flags, _RET_IP_); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. - */ -void -xfs_ilock_demote( - xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); - if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); - - trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); -} - -#ifdef DEBUG -int -xfs_isilocked( - xfs_inode_t *ip, - uint lock_flags) -{ - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); - } - - ASSERT(0); - return 0; -} -#endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa..ba404e4b9f0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -74,6 +74,256 @@ xfs_get_extsz_hint( return 0; } +/* + * This is a wrapper routine around the xfs_ilock() routine used to centralize + * some grungy code. It is used in places that wish to lock the inode solely + * for reading the extents. The reason these places can't just call + * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode is in b-tree + * format, then we need to lock the inode exclusively until the extents are read + * in. Locking it exclusively all the time would limit our parallelism + * unnecessarily, though. What we do instead is check to see if the extents + * have been read in yet, and only lock the inode exclusively if they have not. + * + * The function returns a value which should be given to the corresponding + * xfs_iunlock_map_shared(). This value is the mode in which the lock was + * actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer -- cgit v1.2.3 From 33479e0542df066fb0b47df18780e93bfe6e0dc5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:11 +1100 Subject: xfs: remove xfs_iget.c The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c along with the other inode cache functions. This removes all functionality from xfs_iget.c, so the file can simply be removed. This move results in various functions now only having the scope of a single file (e.g. xfs_inode_free()), so clean up all the definitions and exported prototypes in xfs_icache.[ch] and xfs_inode.h appropriately. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_export.c | 1 + fs/xfs/xfs_icache.c | 421 ++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 6 +- fs/xfs/xfs_iget.c | 455 ----------------------------------------------- fs/xfs/xfs_inode.c | 1 + fs/xfs/xfs_inode.h | 10 +- fs/xfs/xfs_itable.c | 1 + fs/xfs/xfs_log_recover.c | 1 + fs/xfs/xfs_qm.c | 1 + fs/xfs/xfs_rtalloc.c | 1 + fs/xfs/xfs_vnodeops.c | 1 + 12 files changed, 430 insertions(+), 470 deletions(-) delete mode 100644 fs/xfs/xfs_iget.c (limited to 'fs/xfs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 442f256dbca..e65357bb3dc 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -40,7 +40,6 @@ xfs-y += xfs_aops.o \ xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ - xfs_iget.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 8c6d1d70278..a83611849ce 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * Note that we only accept fileids which are long enough rather than allow diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eba216f11d5..9c8703b5cd7 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -41,6 +41,421 @@ #include #include +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -253,7 +668,7 @@ xfs_reclaim_worker( xfs_reclaim_work_queue(mp); } -void +static void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, struct xfs_inode *ip) @@ -319,7 +734,7 @@ __xfs_inode_clear_reclaim( } } -void +STATIC void __xfs_inode_clear_reclaim_tag( xfs_mount_t *mp, xfs_perag_t *pag, @@ -542,7 +957,7 @@ out: * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. */ -int +STATIC int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 0ba9c89c316..222e22f16b4 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -24,6 +24,9 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); @@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index ea9a5fa49a4..00000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_priv.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" -#include "xfs_icache.h" - - -/* - * Allocate and initialise an xfs_inode. - */ -STATIC struct xfs_inode * -xfs_inode_alloc( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - struct xfs_inode *ip; - - /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. - */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); - if (!ip) - return NULL; - if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - ASSERT(ip->i_ino == 0); - - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - /* initialise the xfs inode */ - ip->i_ino = ino; - ip->i_mount = mp; - memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; - memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); - ip->i_flags = 0; - ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - - return ip; -} - -STATIC void -xfs_inode_free_callback( - struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct xfs_inode *ip = XFS_I(inode); - - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. - */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); -} - -/* - * Check the validity of the inode we just found it the cache - */ -static int -xfs_iget_cache_hit( - struct xfs_perag *pag, - struct xfs_inode *ip, - xfs_ino_t ino, - int flags, - int lock_flags) __releases(RCU) -{ - struct inode *inode = VFS_I(ip); - struct xfs_mount *mp = ip->i_mount; - int error; - - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - - /* - * If we are racing with another cache hit that is currently - * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete - * before continuing. - * - * XXX(hch): eventually we should do something equivalent to - * wait_on_inode to wait for these flags to be cleared - * instead of polling for it. - */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - /* - * If lookup is racing with unlink return an error immediately. - */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } - - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - error = -inode_init_always(mp->m_super, inode); - if (error) { - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - } else { - /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = EAGAIN; - goto out_error; - } - - /* We've got a live one. */ - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - trace_xfs_iget_hit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); - - return 0; - -out_error: - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return error; -} - - -static int -xfs_iget_cache_miss( - struct xfs_mount *mp, - struct xfs_perag *pag, - xfs_trans_t *tp, - xfs_ino_t ino, - struct xfs_inode **ipp, - int flags, - int lock_flags) -{ - struct xfs_inode *ip; - int error; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; - - ip = xfs_inode_alloc(mp, ino); - if (!ip) - return ENOMEM; - - error = xfs_iread(mp, tp, ip, flags); - if (error) - goto out_destroy; - - trace_xfs_iget_miss(ip); - - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_destroy; - } - - /* - * Preload the radix tree so we can insert safely under the - * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. - */ - if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; - goto out_destroy; - } - - /* - * Because the inode hasn't been added to the radix-tree yet it can't - * be found by another thread, so we can do the non-sleeping lock here. - */ - if (lock_flags) { - if (!xfs_ilock_nowait(ip, lock_flags)) - BUG(); - } - - /* - * These values must be set before inserting the inode into the radix - * tree as the moment it is inserted a concurrent lookup (allowed by the - * RCU locking mechanism) can find it and that lookup must see that this - * is an inode currently under construction (i.e. that XFS_INEW is set). - * The ip->i_flags_lock that protects the XFS_INEW flag forms the - * memory barrier that ensures this detection works correctly at lookup - * time. - */ - iflags = XFS_INEW; - if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, iflags); - - /* insert the new inode */ - spin_lock(&pag->pag_ici_lock); - error = radix_tree_insert(&pag->pag_ici_root, agino, ip); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; - goto out_preload_end; - } - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - - *ipp = ip; - return 0; - -out_preload_end: - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - if (lock_flags) - xfs_iunlock(ip, lock_flags); -out_destroy: - __destroy_inode(VFS_I(ip)); - xfs_inode_free(ip); - return error; -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; - - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ - ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); - - /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; - - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - agino = XFS_INO_TO_AGINO(mp, ino); - -again: - error = 0; - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); - - if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); - if (error) - goto out_error_or_again; - } else { - rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); - - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, - flags, lock_flags); - if (error) - goto out_error_or_again; - } - xfs_perag_put(pag); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. - */ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); - return 0; - -out_error_or_again: - if (error == EAGAIN) { - delay(1); - goto again; - } - xfs_perag_put(pag); - return error; -} - diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ba404e4b9f0..bba8f37525b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 94b32f906e7..1fc2065e010 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) + /* - * xfs_iget.c prototypes. + * xfs_inode.c prototypes. */ -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_inode_free(struct xfs_inode *ip); - -/* - * xfs_inode.c prototypes. - */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 01d10a66e30..3998fd2a794 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_internal_inum( diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352b..651c98859b0 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -42,6 +42,7 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xlog_find_zeroed( diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 2e86fa0cfc0..48c750b0e83 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The global quota manager. There is only one of these for the entire diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca28a4ba4b5..a69e0b4750a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -38,6 +38,7 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 14928564f10..2ee1f49da0a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -47,6 +47,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The maximum pathlen is 1024 bytes. Since the minimum file system -- cgit v1.2.3 From d35e88faa3b0fc2cea35c3b2dca358b5cd09b45f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of thelog once it has found the head. Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is dervied from the the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log. In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. Then the next iclog is setup, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with it's own LSN. Therefore, If the first iclog completes it's IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into it's tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will no be replayed. The fix is simple - the l_last_sync_lsn is updated when a iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn shoul dbe updated when a transaction is completed by a iclog buffer IO. That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record it written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d2d59692739..46b6986e39b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2461,14 +2461,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; -- cgit v1.2.3 From a00416844b8f4b0106344bdfd90fe45a854b1d05 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + fs/xfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_ialloc.c | 1 + 3 files changed, 5 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe..0287f3b1b50 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707..e1545ec2f7d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31..c5c4ef4f2bd 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; -- cgit v1.2.3 From 2455881c0b52f87be539c4c7deab1afff4d8a560 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. These are almost always in the buffered IO writeback path when convertion delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even those these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overun offended for stack switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that the caller can pass xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b50..43f791bcd8b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca..ef7d4885dc2 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d..91259554df8 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce1..b68c598034c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f858b903678..a066cf1766a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -575,7 +575,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; -- cgit v1.2.3 From e04426b9202bccd4cfcbc70b2fa2aeca1c86d8f5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks are xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue. This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction. If we then need to make another allocation, and all the allocation worker contexts are exhausted because the are blocked waiting for the AGF lock, holder of the AGF cannot get it's xfs-alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b..335206a9c69 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2..feacb061bab 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8..83d0cf3df93 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c..5f469c3516e 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* -- cgit v1.2.3 From 1375cb65e87b327a8dd4f920c3e3d837fb40e9c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Oct 2012 14:50:52 +1100 Subject: xfs: growfs: don't read garbage for new secondary superblocks When updating new secondary superblocks in a growfs operation, the superblock buffer is read from the newly grown region of the underlying device. This is not guaranteed to be zero, so violates the underlying assumption that the unused parts of superblocks are zero filled. Get a new buffer for these secondary superblocks to ensure that the unused regions are zero filled correctly. Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf..4beaede4327 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -399,9 +399,26 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -423,7 +440,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); -- cgit v1.2.3 From 531c3bdc8662e1a83f8ec80dc3346b1284877c0a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Oct 2012 17:22:30 +1100 Subject: xfs: silence uninitialised f.file warning. Uninitialised variable build warning introduced by 2903ff0 ("switch simple cases of fget_light to fdget"), gcc is not smart enough to work out that the variable is not used uninitialised, and the commit removed the initialisation at declaration that the old variable had. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac677..c1df3c623de 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; -- cgit v1.2.3 From cd856db69c88db438215244571957d812bdc6813 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sat, 20 Oct 2012 11:08:19 -0300 Subject: xfs: Update inode alloc comments I found some out of date comments while studying the inode allocation code, so I believe it's worth to have these comments updated. It basically rewrites the comment regarding to "call_again" variable, which is not used anymore, but instead, callers of xfs_ialloc() decides if it needs to be called again relying only if ialloc_context is NULL or not. Also did some small changes in another comment that I thought to be pertinent to the current behaviour of these functions and some alignment on both comments. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 6 +++--- fs/xfs/xfs_inode.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c5c4ef4f2bd..37753e1c853 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -877,9 +877,9 @@ error0: * This function is designed to be called twice if it has to do an allocation * to make more free inodes. On the first call, *IO_agbp should be set to NULL. * If an inode is available without having to performn an allocation, an inode - * number is returned. In this case, *IO_agbp would be NULL. If an allocation - * needes to be done, xfs_dialloc would return the current AGI buffer in - * *IO_agbp. The caller should then commit the current transaction, allocate a + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a * new transaction, and call xfs_dialloc() again, passing in the previous value * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI * buffer is locked across the two calls, the second call is guaranteed to have diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bba8f37525b..95f7a73b05c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1104,16 +1104,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. * The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * -- cgit v1.2.3 From 4c05f9ad4d168098b7ce3ffa7098283f94811ed6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:41 +1100 Subject: xfs: invalidate allocbt blocks moved to the free list When we free a block from the alloc btree tree, we move it to the freelist held in the AGFL and mark it busy in the busy extent tree. This typically happens when we merge btree blocks. Once the transaction is committed and checkpointed, the block can remain on the free list for an indefinite amount of time. Now, this isn't the end of the world at this point - if the free list is shortened, the buffer is invalidated in the transaction that moves it back to free space. If the buffer is allocated as metadata from the free list, then all the modifications getted logged, and we have no issues, either. And if it gets allocated as userdata direct from the freelist, it gets invalidated and so will never get written. However, during the time it sits on the free list, pressure on the log can cause the AIL to be pushed and the buffer that covers the block gets pushed for write. IOWs, we end up writing a freed metadata block to disk. Again, this isn't the end of the world because we know from the above we are only writing to free space. The problem, however, is for validation callbacks. If the block was on old btree root block, then the level of the block is going to be higher than the current tree root, and so will fail validation. There may be other inconsistencies in the block as well, and currently we don't care because the block is in free space. Shutting down the filesystem because a freed block doesn't pass write validation, OTOH, is rather unfriendly. So, make sure we always invalidate buffers as they move from the free space trees to the free list so that we guarantee they never get written to disk while on the free list. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8..f7876c6d616 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } -- cgit v1.2.3 From b6aff29f3af7437635ec3d66af9115bb17ba561f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:42 +1100 Subject: xfs: don't vmap inode cluster buffers during free Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 95f7a73b05c..965598eb308 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1760,7 +1760,8 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; -- cgit v1.2.3 From 137fff09b7924507871f8e6294dfe57b7a880332 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 14:23:12 +1100 Subject: xfs: fix buffer shudown reference count mismatch When we shut down the filesystem, we have to unpin and free all the buffers currently active in the CIL. To do this we unpin and remove them in one operation as a result of a failed iclogbuf write. For buffers, we do this removal via a simultated IO completion of after marking the buffer stale. At the time we do this, we have two references to the buffer - the active LRU reference and the buf log item. The LRU reference is removed by marking the buffer stale, and the active CIL reference is by the xfs_buf_iodone() callback that is run by xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone callback). However, ioend processing requires one more reference - that of the IO that it is completing. We don't have this reference, so we free the buffer prematurely and use it after it is freed. For buffers marked with XBF_ASYNC, this leads to assert failures in xfs_buf_rele() on debug kernels because the b_hold count is zero. Fix this by making sure we take the necessary IO reference before starting IO completion processing on the stale buffer, and set the XBF_ASYNC flag to ensure that IO completion processing removes all the active references from the buffer to ensure it is fully torn down. Cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed91119..becf4a97efc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); -- cgit v1.2.3 From 009507b052fa391618eccf9e8c9f484407fd9018 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:44 +1100 Subject: xfs: fix reading of wrapped log data Commit 4439647 ("xfs: reset buffer pointers before freeing them") in 3.0-rc1 introduced a regression when recovering log buffers that wrapped around the end of log. The second part of the log buffer at the start of the physical log was being read into the header buffer rather than the data buffer, and hence recovery was seeing garbage in the data buffer when it got to the region of the log buffer that was incorrectly read. Cc: # 3.0.x, 3.2.x, 3.4.x 3.6.x Reported-by: Torsten Kaiser Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 651c98859b0..3e06333d4bd 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3542,7 +3542,7 @@ xlog_do_recovery_pass( * - order is important. */ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; -- cgit v1.2.3 From 69a58a43f74eb2cb23d9bce2524dae33c289a40f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Oct 2012 14:11:45 -0500 Subject: xfs: report projid32bit feature in geometry call When xfs gained the projid32bit feature, it was never added to the FSGEOMETRY ioctl feature flags, so it's not queryable without this patch. Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 3 ++- fs/xfs/xfs_fsops.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394..0948c043443 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4beaede4327..7b0a997cf62 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -97,7 +97,9 @@ xfs_fs_geometry( (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; -- cgit v1.2.3 From 27b52867925e3aaed090063c1c58a7537e6373f3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:38 -0500 Subject: xfs: add EOFBLOCKS inode tagging/untagging Add the XFS_ICI_EOFBLOCKS_TAG inode tag to identify inodes with speculatively preallocated blocks beyond EOF. An inode is tagged when speculative preallocation occurs and untagged either via truncate down or when post-EOF blocks are freed via release or reclaim. The tag management is intentionally not aggressive to prefer simplicity over the complexity of handling all the corner cases under which post-EOF blocks could be freed (i.e., forward truncation, fallocate, write error conditions, etc.). This means that a tagged inode may or may not have post-EOF blocks after a period of time. The tag is eventually cleared when the inode is released or reclaimed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_icache.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 3 +++ fs/xfs/xfs_iomap.c | 8 +++++++ fs/xfs/xfs_iops.c | 4 ++++ fs/xfs/xfs_trace.h | 5 +++++ fs/xfs/xfs_vnodeops.c | 2 ++ 7 files changed, 85 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c..22bd4db011c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -233,6 +233,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9c8703b5cd7..f9afc5ff048 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1128,3 +1128,65 @@ xfs_reclaim_inodes_count( return reclaimable; } +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 222e22f16b4..db3613075dc 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -35,6 +35,9 @@ void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); + int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a066cf1766a..add06b4e9a6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -41,6 +41,7 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" +#include "xfs_icache.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -450,6 +451,13 @@ retry: if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap[0]); + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4e00cf091d2..81f5c495328 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -854,6 +855,9 @@ xfs_setattr_size( * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); } if (mask & ATTR_CTIME) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7d36ccf57f9..6f46e034b76 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -130,6 +130,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, @@ -585,6 +587,9 @@ DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2ee1f49da0a..e6e1d11dfdf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -238,6 +238,8 @@ xfs_free_eofblocks( } else { error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); -- cgit v1.2.3 From a454f7428ffa03c8e1321124d9074101b7290be6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:39 -0500 Subject: xfs: support a tag-based inode_ag_iterator Genericize xfs_inode_ag_walk() to support an optional radix tree tag and args argument for the execute function. Create a new wrapper called xfs_inode_ag_iterator_tag() that performs a tag based walk of perag's and inodes. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 56 ++++++++++++++++++++++++++++++++++++++++++------ fs/xfs/xfs_icache.h | 9 ++++++-- fs/xfs/xfs_qm_syscalls.c | 5 +++-- 3 files changed, 59 insertions(+), 11 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f9afc5ff048..2a96dc48ebe 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -516,8 +516,11 @@ xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { uint32_t first_index; int last_error = 0; @@ -536,9 +539,17 @@ restart: int i; rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + if (!nr_found) { rcu_read_unlock(); break; @@ -579,7 +590,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags); + error = execute(batch[i], pag, flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -608,8 +619,10 @@ int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args) { struct xfs_perag *pag; int error = 0; @@ -619,7 +632,36 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index db3613075dc..54c113478df 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,7 +40,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args, int tag); #endif diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7a9071f8855..5f53e75409b 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -846,7 +846,8 @@ STATIC int xfs_dqrele_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int flags, + void *args) { /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || @@ -882,5 +883,5 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); } -- cgit v1.2.3 From 72b53efa4a6125a4c334871c58268c430605819a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:40 -0500 Subject: xfs: create helper to check whether to free eofblocks on inode This check is used in multiple places to determine whether we should check for (and potentially free) post EOF blocks on an inode. Add a helper to consolidate the check. Note that when we remove an inode from the cache (xfs_inactive()), we are required to trim post-EOF blocks even if the inode is marked preallocated or append-only to maintain correct space accounting. The 'force' parameter to xfs_can_free_eofblocks() specifies whether we should ignore the prealloc/append-only status of the inode. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_vnodeops.c | 19 +++++++------------ 3 files changed, 45 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 965598eb308..7449cb943ef 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3912,3 +3912,40 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1fc2065e010..21b4de3df71 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -585,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); void xfs_iext_irec_compact_pages(xfs_ifork_t *); void xfs_iext_irec_compact_full(xfs_ifork_t *); void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +bool xfs_can_free_eofblocks(struct xfs_inode *, bool); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e6e1d11dfdf..c4c15390020 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -436,11 +436,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (xfs_can_free_eofblocks(ip, false)) { /* * If we can't get the iolock just skip truncating the blocks @@ -516,13 +512,12 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; -- cgit v1.2.3 From 40165e27617e2a98bf8588001d2f2872fae2fee2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:41 -0500 Subject: xfs: make xfs_free_eofblocks() non-static, return EAGAIN on trylock failure Turn xfs_free_eofblocks() into a non-static function, return EAGAIN to indicate trylock failure and make sure this error is not propagated in xfs_release(). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 6 +++--- fs/xfs/xfs_vnodeops.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4c15390020..c2ddd7a4394 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -151,7 +151,7 @@ xfs_readlink( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. */ -STATIC int +int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, @@ -200,7 +200,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); - return 0; + return EAGAIN; } } @@ -463,7 +463,7 @@ xfs_release( return 0; error = xfs_free_eofblocks(mp, ip, true); - if (error) + if (error && error != EAGAIN) return error; /* delalloc blocks after truncation means it really is dirty */ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 447e146b2ba..52fafc416a0 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -57,5 +57,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); #endif /* _XFS_VNODEOPS_H */ -- cgit v1.2.3 From 41176a68e3f710630feace536d0277a092e206b5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:42 -0500 Subject: xfs: create function to scan and clear EOFBLOCKS inodes xfs_inodes_free_eofblocks() implements scanning functionality for EOFBLOCKS inodes. It uses the AG iterator to walk the tagged inodes and free post-EOF blocks via the xfs_inode_free_eofblocks() execute function. The scan can be invoked in best-effort mode or wait (force) mode. A best-effort scan (default) handles all inodes that do not have a dirty cache and we successfully acquire the io lock via trylock. In wait mode, we continue to cycle through an AG until all inodes are handled. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + 3 files changed, 45 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2a96dc48ebe..d115cb44b10 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,49 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags, + void *args) +{ + int ret; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + int flags) +{ + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + NULL, XFS_ICI_EOFBLOCKS_TAG); +} + void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 54c113478df..cb6b8d0eee6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,6 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, int); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6f46e034b76..cb523463207 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -589,6 +589,7 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), -- cgit v1.2.3 From 8ca149de80478441352a8622ea15fae7de703ced Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:12 -0500 Subject: xfs: add XFS_IOC_FREE_EOFBLOCKS ioctl The XFS_IOC_FREE_EOFBLOCKS ioctl allows users to invoke an EOFBLOCKS scan. The xfs_eofblocks structure is defined to support the command parameters (scan mode). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 17 +++++++++++++++++ fs/xfs/xfs_icache.c | 10 +++++++--- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_ioctl.c | 20 ++++++++++++++++++++ 4 files changed, 45 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0948c043443..0cfa30813b1 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -339,6 +339,22 @@ typedef struct xfs_error_injection { } xfs_error_injection_t; +/* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + __u64 pad[15]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC) + + /* * The user-level Handle Request interface structure. */ @@ -457,6 +473,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d115cb44b10..fbb74c71526 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1206,11 +1206,15 @@ xfs_inode_free_eofblocks( int xfs_icache_free_eofblocks( struct xfs_mount *mp, - int flags) + struct xfs_eofblocks *eofb) { - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - NULL, XFS_ICI_EOFBLOCKS_TAG); + eofb, XFS_ICI_EOFBLOCKS_TAG); } void diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index cb6b8d0eee6..4934a77024c 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,7 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, int); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1df3c623de..5b20ab0b4f9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -42,6 +42,7 @@ #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -1602,6 +1603,25 @@ xfs_file_ioctl( error = xfs_errortag_clearall(mp, 1); return -error; + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_eofblocks eofb; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); + + if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) + return -XFS_ERROR(EINVAL); + + if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) + return -XFS_ERROR(EINVAL); + + if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + return -XFS_ERROR(EINVAL); + + error = xfs_icache_free_eofblocks(mp, &eofb); + return -error; + } + default: return -ENOTTY; } -- cgit v1.2.3 From 3e3f9f5863548e870edfcc72e7617ac8ddcad44a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:13 -0500 Subject: xfs: add inode id filtering to eofblocks scan Support inode ID filtering in the eofblocks scan. The caller must set the associated XFS_EOF_FLAGS_*ID bit and ID field. Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 14 ++++++++++++-- fs/xfs/xfs_icache.c | 22 ++++++++++++++++++++++ fs/xfs/xfs_ioctl.c | 3 ++- 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0cfa30813b1..a19f9b205c1 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -346,13 +346,23 @@ typedef struct xfs_error_injection { struct xfs_eofblocks { __u32 eof_version; __u32 eof_flags; - __u64 pad[15]; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 pad64[13]; }; /* eof_flags values */ #define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ #define XFS_EOF_FLAGS_VALID \ - (XFS_EOF_FLAGS_SYNC) + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fbb74c71526..b239da91c43 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,21 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if (eofb->eof_flags & XFS_EOF_FLAGS_UID) + return ip->i_d.di_uid == eofb->eof_uid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) + return ip->i_d.di_gid == eofb->eof_gid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) + return xfs_get_projid(ip) == eofb->eof_prid; + + return 0; +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1178,6 +1193,7 @@ xfs_inode_free_eofblocks( void *args) { int ret; + struct xfs_eofblocks *eofb = args; if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1194,6 +1210,12 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; + if (eofb && + (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| + XFS_EOF_FLAGS_PRID)) && + !xfs_inode_match_id(ip, eofb)) + return 0; + ret = xfs_free_eofblocks(ip->i_mount, ip, true); /* don't revisit the inode if we're not waiting */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5b20ab0b4f9..c1c3ef88a26 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1615,7 +1615,8 @@ xfs_file_ioctl( if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) return -XFS_ERROR(EINVAL); - if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || + memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) return -XFS_ERROR(EINVAL); error = xfs_icache_free_eofblocks(mp, &eofb); -- cgit v1.2.3 From 1b5560488d1ab7c932f6f99385b41116838c3486 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:45 -0500 Subject: xfs: support multiple inode id filtering in eofblocks scan Enhance the eofblocks scan code to filter based on multiply specified inode id values. When multiple inode id values are specified, only inodes that match all id values are selected. Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b239da91c43..32908909815 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1175,14 +1175,19 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UID) - return ip->i_d.di_uid == eofb->eof_uid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) - return ip->i_d.di_gid == eofb->eof_gid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) - return xfs_get_projid(ip) == eofb->eof_prid; + if (eofb->eof_flags & XFS_EOF_FLAGS_UID && + ip->i_d.di_uid != eofb->eof_uid) + return 0; - return 0; + if (eofb->eof_flags & XFS_EOF_FLAGS_GID && + ip->i_d.di_gid != eofb->eof_gid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; } STATIC int @@ -1210,10 +1215,7 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && - (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| - XFS_EOF_FLAGS_PRID)) && - !xfs_inode_match_id(ip, eofb)) + if (eofb && !xfs_inode_match_id(ip, eofb)) return 0; ret = xfs_free_eofblocks(ip->i_mount, ip, true); -- cgit v1.2.3 From 00ca79a04bef1a1b30ef8afd992d905b6d986caf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:14 -0500 Subject: xfs: add minimum file size filtering to eofblocks scan Support minimum file size filtering in the eofblocks scan. The caller must set the XFS_EOF_FLAGS_MINFILESIZE flags bit and minimum file size value in bytes. Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 7 +++++-- fs/xfs/xfs_icache.c | 11 +++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index a19f9b205c1..6dda3f949b0 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -350,7 +350,8 @@ struct xfs_eofblocks { gid_t eof_gid; prid_t eof_prid; __u32 pad32; - __u64 pad64[13]; + __u64 eof_min_file_size; + __u64 pad64[12]; }; /* eof_flags values */ @@ -358,11 +359,13 @@ struct xfs_eofblocks { #define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ #define XFS_EOF_FLAGS_VALID \ (XFS_EOF_FLAGS_SYNC | \ XFS_EOF_FLAGS_UID | \ XFS_EOF_FLAGS_GID | \ - XFS_EOF_FLAGS_PRID) + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 32908909815..906e6dcd2c5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1215,8 +1215,15 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && !xfs_inode_match_id(ip, eofb)) - return 0; + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } ret = xfs_free_eofblocks(ip->i_mount, ip, true); -- cgit v1.2.3 From 579b62faa5fb16ffeeb88cda5e2c4e95730881af Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:47 -0500 Subject: xfs: add background scanning to clear eofblocks inodes Create a new mount workqueue and delayed_work to enable background scanning and freeing of eofblocks inodes. The scanner kicks in once speculative preallocation occurs and stops requeueing itself when no eofblocks inodes exist. The scan interval is based on the new 'speculative_prealloc_lifetime' tunable (default to 5m). The background scanner performs unfiltered, best effort scans (which skips inodes under lock contention or with a dirty cache mapping). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_globals.c | 4 +++- fs/xfs/xfs_icache.c | 29 +++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_mount.c | 2 ++ fs/xfs/xfs_mount.h | 3 +++ fs/xfs/xfs_super.c | 9 +++++++++ fs/xfs/xfs_sysctl.c | 9 +++++++++ fs/xfs/xfs_sysctl.h | 1 + 9 files changed, 58 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b..5399ef222dd 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,7 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -40,4 +41,5 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 906e6dcd2c5..96e344e3e92 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -615,6 +615,32 @@ restart: return last_error; } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'background_prealloc_discard_period' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1273,6 +1299,9 @@ xfs_inode_set_eofblocks_tag( XFS_ICI_EOFBLOCKS_TAG); spin_unlock(&ip->i_mount->m_perag_lock); + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 4934a77024c..e0f138c70a2 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -38,6 +38,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d6..0a134ca5211 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -118,6 +118,7 @@ #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f1c997704c..41ae7e1590f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1428,6 +1428,8 @@ xfs_unmountfs( __uint64_t resblks; int error; + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a631ca3b906..dc306a09f56 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -196,6 +196,8 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ @@ -207,6 +209,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3d9ea947e9f..ab8839b2627 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -874,8 +874,15 @@ xfs_init_mount_workqueues( if (!mp->m_log_workqueue) goto out_destroy_reclaim; + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + return 0; +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -892,6 +899,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); @@ -1393,6 +1401,7 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); mp->m_super = sb; sb->s_fs_info = mp; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa43..2801b5ce6cd 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -202,6 +202,15 @@ static ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8..bd8e157c20e 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ } xfs_param_t; /* -- cgit v1.2.3 From 07428d7f0ca46087f7f1efa895322bb9dc1ac21d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:44 +1100 Subject: xfs: fix attr tree double split corruption In certain circumstances, a double split of an attribute tree is needed to insert or replace an attribute. In rare situations, this can go wrong, leaving the attribute tree corrupted. In this case, the attr being replaced is the last attr in a leaf node, and the replacement is larger so doesn't fit in the same leaf node. When we have the initial condition of a node format attribute btree with two leaves at index 1 and 2. Call them L1 and L2. The leaf L1 is completely full, there is not a single byte of free space in it. L2 is mostly empty. The attribute being replaced - call it X - is the last attribute in L1. The way an attribute replace is executed is that the replacement attribute - call it Y - is first inserted into the tree, but has an INCOMPLETE flag set on it so that list traversals ignore it. Once this transaction is committed, a second transaction it run to atomically mark Y as COMPLETE and X as INCOMPLETE, so that a traversal will now find Y and skip X. Once that transaction is committed, attribute X is then removed. So, the initial condition is: +--------+ +--------+ | L1 | | L2 | | fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 | | fsp: 0 | | fsp: N | |--------| |--------| | attr A | | attr 1 | |--------| |--------| | attr B | | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr X | | attr n | +--------+ +--------+ So now we go to replace X, and see that L1:fsp = 0 - it is full so we can't insert Y in the same leaf. So we record the the location of attribute X so we can track it for later use, then we split L1 into L1 and L3 and reblance across the two leafs. We end with: +--------+ +--------+ +--------+ | L1 | | L3 | | L2 | | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: N | |--------| |--------| |--------| | attr A | | attr X | | attr 1 | |--------| +--------+ |--------| | attr B | | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ And we track that the original attribute is now at L3:0. We then try to insert Y into L1 again, and find that there isn't enough room because the new attribute is larger than the old one. Hence we have to split again to make room for Y. We end up with this: +--------+ +--------+ +--------+ +--------+ | L1 | | L4 | | L3 | | L2 | | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: J | | fsp: N | |--------| |--------| |--------| |--------| | attr A | | attr Y | | attr X | | attr 1 | |--------| + INCOMP + +--------+ |--------| | attr B | +--------+ | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ And now we have the new (incomplete) attribute @ L4:0, and the original attribute at L3:0. At this point, the first transaction is committed, and we move to the flipping of the flags. This is where we are supposed to end up with this: +--------+ +--------+ +--------+ +--------+ | L1 | | L4 | | L3 | | L2 | | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 | | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 | | fsp: M | | fsp: J | | fsp: J | | fsp: N | |--------| |--------| |--------| |--------| | attr A | | attr Y | | attr X | | attr 1 | |--------| +--------+ + INCOMP + |--------| | attr B | +--------+ | attr 2 | |--------| |--------| .......... .......... |--------| |--------| | attr W | | attr n | +--------+ +--------+ But that doesn't happen properly - the attribute tracking indexes are not pointing to the right locations. What we end up with is both the old attribute to be removed pointing at L4:0 and the new attribute at L4:1. On a debug kernel, this assert fails like so: XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725 because the new attribute location does not exist. On a production kernel, this goes unnoticed and the code proceeds ahead merrily and removes L4 because it thinks that is the block that is no longer needed. This leaves the hash index node pointing to entries L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free space, and so everything is busted. This corruption is caused by the removal of the old attribute triggering a join - it joins everything correctly but then frees the wrong block. xfs_repair will report something like: bad sibling back pointer for block 4 in attribute fork for inode 131 problem with attribute contents in inode 131 would clear attr fork bad nblocks 8 for inode 131, would reset to 3 bad anextents 4 for inode 131, would reset to 0 The problem lies in the assignment of the old/new blocks for tracking purposes when the double leaf split occurs. The first split tries to place the new attribute inside the current leaf (i.e. "inleaf == true") and moves the old attribute (X) to the new block. This sets up the old block/index to L1:X, and newly allocated block to L3:0. It then moves attr X to the new block and tries to insert attr Y at the old index. That fails, so it splits again. With the second split, the rebalance ends up placing the new attr in the second new block - L4:0 - and this is where the code goes wrong. What is does is it sets both the new and old block index to the second new block. Hence it inserts attr Y at the right place (L4:0) but overwrites the current location of the attr to replace that is held in the new block index (currently L3:0). It over writes it with L4:1 - the index we later assert fail on. Hopefully this table will show this in a foramt that is a bit easier to understand: Split old attr index new attr index vanilla patched vanilla patched before 1st L1:26 L1:26 N/A N/A after 1st L3:0 L3:0 L1:26 L1:26 after 2nd L4:0 L3:0 L4:1 L4:0 ^^^^ ^^^^ wrong wrong The fix is surprisingly simple, for all this analysis - just stop the rebalance on the out-of leaf case from overwriting the new attr index - it's already correct for the double split case. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca73..70eec182977 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); -- cgit v1.2.3 From 7bf7f352194252e6f05981d44fb8cb55668606cd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating a inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers are BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building on the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41..e57e2daa357 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; -- cgit v1.2.3 From 37eb17e604ac7398bbb133c82f281475d704fff7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b86..4b0b8dd1b7b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } -- cgit v1.2.3 From ee73259b401317117e7f5d4834c270b10b12bc8e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:53 +1100 Subject: xfs: add more attribute tree trace points. Added when debugging recent attribute tree problems to more finely trace code execution through the maze of twisty passages that makes up the attr code. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 18 +++++++++++++++++ fs/xfs/xfs_attr_leaf.c | 37 ++++++++++++++++++++-------------- fs/xfs/xfs_da_btree.c | 6 ++++++ fs/xfs/xfs_trace.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 99 insertions(+), 16 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 0ca1f0be62d..55bbe98e8f8 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1155,6 +1155,8 @@ xfs_attr_leaf_get(xfs_da_args_t *args) struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, XFS_ATTR_FORK); @@ -1185,6 +1187,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) int error; struct xfs_buf *bp; + trace_xfs_attr_leaf_list(context); + context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) @@ -1653,6 +1657,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1699,6 +1705,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1755,6 +1763,8 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; @@ -1804,6 +1814,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; + trace_xfs_attr_node_list(context); + cursor = context->cursor; cursor->initted = 1; @@ -1959,6 +1971,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; + trace_xfs_attr_rmtval_get(args); + ASSERT(!(args->flags & ATTR_KERNOVAL)); mp = args->dp->i_mount; @@ -2014,6 +2028,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; + trace_xfs_attr_rmtval_set(args); + dp = args->dp; mp = dp->i_mount; src = args->value; @@ -2143,6 +2159,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) xfs_dablk_t lblkno; int valuelen, blkcnt, nmap, error, done, committed; + trace_xfs_attr_rmtval_remove(args); + mp = args->dp->i_mount; /* diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 70eec182977..4bfc732bc9c 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); +STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, + struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -1071,7 +1072,7 @@ xfs_attr_leaf_add( * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr_leaf_compact(args, bp); /* * After compaction, the block is guaranteed to have only one @@ -1102,6 +1103,8 @@ xfs_attr_leaf_add_work( xfs_mount_t *mp; int tmp, i; + trace_xfs_attr_leaf_add_work(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; @@ -1214,15 +1217,17 @@ xfs_attr_leaf_add_work( */ STATIC void xfs_attr_leaf_compact( - struct xfs_trans *trans, - struct xfs_buf *bp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_trans *trans = args->trans; + struct xfs_mount *mp = trans->t_mountp; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); - mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); @@ -1345,9 +1350,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr2->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. @@ -1378,9 +1382,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr1->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. @@ -1577,6 +1580,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_attr_leaf_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -1702,6 +1707,8 @@ xfs_attr_leaf_remove( int tablesize, tmp, i; xfs_mount_t *mp; + trace_xfs_attr_leaf_remove(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7bfb7dd334f..c62e7e6ff50 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -779,6 +779,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_da_node_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -900,6 +902,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) xfs_dahash_t lasthash=0; int level, count; + trace_xfs_da_fixhashpath(state->args); + level = path->active-1; blk = &path->blk[ level ]; switch (blk->magic) { @@ -1417,6 +1421,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, xfs_dablk_t blkno=0; int level, error; + trace_xfs_da_path_shift(state->args); + /* * Roll up the Btree looking for the first block where our * current index is not at the edge of the block. Note that diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cb523463207..2e137d4a85a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); DECLARE_EVENT_CLASS(xfs_perag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, @@ -1502,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + #define DEFINE_ATTR_EVENT(name) \ -DEFINE_EVENT(xfs_da_class, name, \ +DEFINE_EVENT(xfs_attr_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_ATTR_EVENT(xfs_attr_sf_add); @@ -1517,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); DEFINE_ATTR_EVENT(xfs_attr_leaf_split); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); @@ -1532,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ @@ -1556,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split); DEFINE_DA_EVENT(xfs_da_node_remove); DEFINE_DA_EVENT(xfs_da_node_rebalance); DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); DEFINE_DA_EVENT(xfs_da_swap_lastblock); DEFINE_DA_EVENT(xfs_da_grow_inode); DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), -- cgit v1.2.3 From b64f3a390d3477517cbff7d613e551705540769b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 13 Nov 2012 16:40:27 -0600 Subject: xfs: use btree block initialisation functions in growfs Factor xfs_btree_init_block() to be independent of the btree cursor, and use the function to initialise btree blocks in the growfs code. This makes adding support for different format btree blocks simple. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_btree.c | 33 ++++++++++++++++++++++++--------- fs/xfs/xfs_btree.h | 11 +++++++++++ fs/xfs/xfs_fsops.c | 37 +++++++++++++------------------------ 3 files changed, 48 insertions(+), 33 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b158..121ea99e615 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -853,18 +853,22 @@ xfs_btree_set_sibling( } } -STATIC void +void xfs_btree_init_block( - struct xfs_btree_cur *cur, - int level, - int numrecs, - struct xfs_btree_block *new) /* new block */ + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags) { - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); + + new->bb_magic = cpu_to_be32(magic); new->bb_level = cpu_to_be16(level); new->bb_numrecs = cpu_to_be16(numrecs); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (flags & XFS_BTREE_LONG_PTRS) { new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { @@ -873,6 +877,17 @@ xfs_btree_init_block( } } +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_buf *bp) +{ + xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], + level, numrecs, cur->bc_flags); +} + /* * Return true if ptr is the last record in the btree and * we need to track updateÑ• to this record. The decision @@ -2183,7 +2198,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); /* * Split the entries between the old and the new block evenly. @@ -2492,7 +2507,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c..c9cf2d00e23 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -378,6 +378,17 @@ xfs_btree_reada_bufs( xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count); /* count of filesystem blocks */ +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags); /* * Common btree core entry points. diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7b0a997cf62..a5034af35db 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -125,7 +125,6 @@ xfs_growfs_data_private( xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -263,17 +262,14 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -289,18 +285,15 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,13 +309,9 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) -- cgit v1.2.3 From fd23683c3b1ab905cba61ea2981c156f4bf52845 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:59 +1100 Subject: xfs: growfs: use uncached buffers for new headers When writing the new AG headers to disk, we can't attach write verifiers because they have a dependency on the struct xfs-perag being attached to the buffer to be fully initialised and growfs can't fully initialise them until later in the process. The simplest way to avoid this problem is to use uncached buffers for writing the new headers. These buffers don't have the xfs-perag attached to them, so it's simple to detect in the write verifier and be able to skip the checks that need the xfs-perag. This enables us to attach the appropriate buffer ops to the buffer and hence calculate CRCs on the way to disk. IT also means that the buffer is torn down immediately, and so the first access to the AG headers will re-read the header from disk and perform full verification of the buffer. This way we also can catch corruptions due to problems that went undetected in growfs. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 63 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a5034af35db..2196830bf5c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -114,6 +114,26 @@ xfs_fs_geometry( return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ @@ -189,15 +209,15 @@ xfs_growfs_data_private( /* * AG freelist header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -226,15 +246,15 @@ xfs_growfs_data_private( /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -255,16 +275,16 @@ xfs_growfs_data_private( /* * BNO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -278,16 +298,15 @@ xfs_growfs_data_private( /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -302,14 +321,14 @@ xfs_growfs_data_private( /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); error = xfs_bwrite(bp); -- cgit v1.2.3 From de497688daaabbab425a8a969528272ec1d962a6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:00 +1100 Subject: xfs: make growfs initialise the AGFL header For verification purposes, AGFLs need to be initialised to a known set of values. For upcoming CRC changes, they are also headers that need to be initialised. Currently, growfs does neither for the AGFLs - it ignores them completely. Add initialisation of the AGFL to be full of invalid block numbers (NULLAGBLOCK) to put the infrastructure in place needed for CRC support. Includes a comment clarification from Jeff Liu. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2196830bf5c..bd9cb7f0b07 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -140,6 +140,7 @@ xfs_growfs_data_private( xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; @@ -207,7 +208,7 @@ xfs_growfs_data_private( nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* - * AG freelist header block + * AG freespace header block */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), @@ -243,6 +244,26 @@ xfs_growfs_data_private( if (error) goto error0; + /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) { + error = ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + /* * AG inode header block */ -- cgit v1.2.3 From f5b8911b67eb4f15d95d5e5324d376d4a49d56e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:42:47 +1100 Subject: xfs: remove xfs_tosspages It's a buggy, unnecessary wrapper that is duplicating truncate_pagecache_range(). When replacing the call in xfs_change_file_space(), also ensure that the length being allocated/freed is always positive before making any changes. These checks are done in the lower extent manipulation functions, too, but we need to do them before any page cache operations. Reported-by: Andrew Dahl Signed-off-by: Dave Chinner Reviewed-By: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 3 +-- fs/xfs/xfs_fs_subr.c | 12 ------------ fs/xfs/xfs_vnodeops.c | 30 +++++++++++++++++++++++++----- fs/xfs/xfs_vnodeops.h | 2 -- 4 files changed, 26 insertions(+), 21 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b9b8646e62d..b2c63a28afa 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -315,8 +315,7 @@ xfs_swap_extents( * are safe. We don't really care if non-io related * fields change. */ - - xfs_tosspages(ip, 0, -1, FI_REMAPF); + truncate_pagecache_range(VFS_I(ip), 0, -1); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 652b875a9d4..d49de3d7045 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -25,18 +25,6 @@ * note: all filemap functions return negative error codes. These * need to be inverted before returning to the xfs core functions. */ -void -xfs_tosspages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - /* can't toss partial tail pages, so mask them out */ - last &= ~(PAGE_SIZE - 1); - truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); -} - int xfs_flushinval_pages( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c2ddd7a4394..de3702a57e5 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2118,7 +2118,7 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t llen; + xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; int prealloc_type; @@ -2139,12 +2139,30 @@ xfs_change_file_space( return XFS_ERROR(EINVAL); } - llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) + return XFS_ERROR(EINVAL); + break; + default: + bf->l_len = 0; + break; + } if (bf->l_start < 0 || bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + llen < 0 || - bf->l_start + llen > mp->m_super->s_maxbytes) + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; @@ -2169,7 +2187,9 @@ xfs_change_file_space( switch (cmd) { case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; - xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; + if (startoffset > end) + truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 52fafc416a0..d48141d6bc3 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, -- cgit v1.2.3 From d6638ae244f6323fcdf85e72eb4a5af6f6212893 Mon Sep 17 00:00:00 2001 From: Andrew Dahl Date: Wed, 14 Nov 2012 12:52:26 -0600 Subject: xfs: reverse the check on XFS_IOC_ZERO_RANGE Reversing the check on XFS_IOC_ZERO_RANGE. Range should be zeroed if the start is less than or equal to the end. Signed-off-by: Andrew Dahl Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de3702a57e5..46a7a5de5d6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2188,7 +2188,7 @@ xfs_change_file_space( case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset > end) + if (startoffset <= end) truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: -- cgit v1.2.3 From 95eacf0f71b7682a05b8242c49c68e8e4bb673e3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:55 +1100 Subject: xfs: remove xfs_wait_on_pages() It's just a simple wrapper around a VFS function that is only called by another function in xfs_fs_subr.c. Remove it and call the VFS function directly. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_fs_subr.c | 18 ++---------------- fs/xfs/xfs_vnodeops.h | 1 - 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index d49de3d7045..33658234dfc 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -62,23 +62,9 @@ xfs_flush_pages( last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; - ret2 = xfs_wait_on_pages(ip, first, last); + ret2 = -filemap_fdatawait_range(mapping, first, + last == -1 ? XFS_ISIZE(ip) - 1 : last); if (!ret) ret = ret2; return ret; } - -int -xfs_wait_on_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { - return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - } - return 0; -} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index d48141d6bc3..c8ad48b61a2 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -52,7 +52,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); -int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.3 From 4bc1ea6b8ddd4f2bd78944fbe5a1042ac14b1f5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:56 +1100 Subject: xfs: remove xfs_flush_pages It is a complex wrapper around VFS functions, but there are VFS functions that provide exactly the same functionality. Call the VFS functions directly and remove the unnecessary indirection and complexity. We don't need to care about clearing the XFS_ITRUNCATED flag, as that is done during .writepages. Hence is cleared by the VFS writeback path if there is anything to write back during the flush. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_fs_subr.c | 24 ------------------------ fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_vnodeops.c | 7 +++++-- fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 9 insertions(+), 32 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2daa357..71361da1f77 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1641,7 +1641,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 83d0cf3df93..a60f3d1f151 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5599,7 +5599,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 33658234dfc..b5380893728 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -44,27 +44,3 @@ xfs_flushinval_pages( truncate_inode_pages_range(mapping, first, last); return -ret; } - -int -xfs_flush_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - int ret2; - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (flags & XBF_ASYNC) - return ret; - ret2 = -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - if (!ret) - ret = ret2; - return ret; -} diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f5c495328..d82efaa2ac7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -780,8 +780,8 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, - FI_NONE); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 46a7a5de5d6..c00326afa7b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -428,8 +428,11 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } } } diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index c8ad48b61a2..73cb3cb15f7 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -50,8 +50,6 @@ int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); -int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, uint64_t flags, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.3 From fb59581404ab7ec5075299065c22cb211a9262a9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:57 +1100 Subject: xfs: remove xfs_flushinval_pages It's just a simple wrapper around VFS functionality, and is actually bugging in that it doesn't remove mappings before invalidating the page cache. Remove it and replace it with the correct VFS functionality. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_dfrag.c | 10 ++++------ fs/xfs/xfs_file.c | 23 ++++++++++++----------- fs/xfs/xfs_fs_subr.c | 46 ---------------------------------------------- fs/xfs/xfs_vnodeops.c | 11 +++++------ fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 21 insertions(+), 72 deletions(-) delete mode 100644 fs/xfs/xfs_fs_subr.c (limited to 'fs/xfs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e65357bb3dc..d02201df855 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,7 +37,6 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ xfs_ioctl.o \ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b2c63a28afa..d0e9c74d3d9 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,12 +246,10 @@ xfs_swap_extents( goto out_unlock; } - if (VN_CACHED(VFS_I(tip)) != 0) { - error = xfs_flushinval_pages(tip, 0, -1, - FI_REMAPF_LOCKED); - if (error) - goto out_unlock; - } + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(ip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index daf4066c24b..c42f99e71f1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -255,15 +255,14 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((iocb->ki_pos & target->bt_smask) || - (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } - n = mp->m_super->s_maxbytes - iocb->ki_pos; + n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; @@ -289,20 +288,21 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, - (iocb->ki_pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } + truncate_pagecache_range(VFS_I(ip), pos, -1); } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -670,10 +670,11 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, - FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); if (ret) goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); } /* diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index b5380893728..00000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_trace.h" - -/* - * note: all filemap functions return negative error codes. These - * need to be inverted before returning to the xfs core functions. - */ -int -xfs_flushinval_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - - trace_xfs_pagecache_inval(ip, first, last); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (!ret) - truncate_inode_pages_range(mapping, first, last); - return -ret; -} diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c00326afa7b..81c61fd1789 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1958,12 +1958,11 @@ xfs_free_file_space( rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - - if (VN_CACHED(VFS_I(ip)) != 0) { - error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_iolock; - } + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out_unlock_iolock; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); /* * Need to zero the stuff we're not freeing, on disk. diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 73cb3cb15f7..91a03fa3814 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.3 From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:01 +1100 Subject: xfs: make buffer read verication an IO completion function Add a verifier function callback capability to the buffer read interfaces. This will be used by the callers to supply a function that verifies the contents of the buffer when it is read from disk. This patch does not provide callback functions, but simply modifies the interfaces to allow them to be called. The reason for adding this to the read interfaces is that it is very difficult to tell fom the outside is a buffer was just read from disk or whether we just pulled it out of cache. Supplying a callbck allows the buffer cache to use it's internal knowledge of the buffer to execute it only when the buffer is read from disk. It is intended that the verifier functions will mark the buffer with an EFSCORRUPTED error when verification fails. This allows the reading context to distinguish a verification error from an IO error, and potentially take further actions on the buffer (e.g. attempt repair) based on the error reported. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_btree.c | 21 ++++++++++++--------- fs/xfs/xfs_buf.c | 13 +++++++++---- fs/xfs/xfs_buf.h | 20 ++++++++++++-------- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_fsops.c | 4 ++-- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_log_recover.c | 8 +++++--- fs/xfs/xfs_mount.c | 6 +++--- fs/xfs/xfs_qm.c | 5 +++-- fs/xfs/xfs_rtalloc.c | 6 +++--- fs/xfs/xfs_trans.h | 19 ++++++++----------- fs/xfs/xfs_trans_buf.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 2 +- 19 files changed, 75 insertions(+), 61 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c69..21c3db08fd0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -447,7 +447,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2110,7 +2110,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 55bbe98e8f8..474c57a43cc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1994,7 +1994,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 121ea99e615..7e791160092 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -266,9 +266,12 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, NULL); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -624,10 +627,10 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, NULL); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); @@ -650,7 +653,7 @@ xfs_btree_reada_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } /* @@ -670,7 +673,7 @@ xfs_btree_reada_bufs( ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } STATIC int @@ -1013,7 +1016,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4b0b8dd1b7b..0298dd68479 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -654,7 +654,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { struct xfs_buf *bp; @@ -666,6 +667,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_iodone = verify; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +693,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + xfs_buf_iodone_t verify) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); } /* @@ -709,7 +712,8 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; int error; @@ -723,6 +727,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a155..677b1dc822f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -100,6 +100,7 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 struct xfs_buf_map { @@ -159,7 +160,6 @@ typedef struct xfs_buf { #endif } xfs_buf_t; - /* Finding and Reading Buffers */ struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -196,9 +196,10 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); + xfs_buf_flags_t flags, xfs_buf_iodone_t verify); void xfs_buf_readahead_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps); + struct xfs_buf_map *map, int nmaps, + xfs_buf_iodone_t verify); static inline struct xfs_buf * xfs_buf_get( @@ -216,20 +217,22 @@ xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags); + return xfs_buf_read_map(target, &map, 1, flags, verify); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1); + return xfs_buf_readahead_map(target, &map, 1, verify); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -239,7 +242,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + xfs_buf_iodone_t verify); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c62e7e6ff50..4af8bad7068 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2161,7 +2161,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp); + mapp, nmap, 0, &bp, NULL); if (error) goto out_free; @@ -2237,7 +2237,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd0..bac86984e40 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -926,7 +926,7 @@ xfs_dir2_leaf_readbuf( XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + mip->ra_offset), - (int)BTOBB(mp->m_dirblksize)); + (int)BTOBB(mp->m_dirblksize), NULL); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca484..e95f800333d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -439,7 +439,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); + 0, &bp, NULL); if (error || !bp) return XFS_ERROR(error); } @@ -920,7 +920,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index bd9cb7f0b07..5440768ec41 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -168,7 +168,7 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -439,7 +439,7 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 37753e1c853..12e3dead439 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1490,7 +1490,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7449cb943ef..8d696301048 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -408,7 +408,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); + (int)imap->im_len, buf_flags, &bp, NULL); if (error) { if (error != EAGAIN) { xfs_warn(mp, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46b6986e39b..1d6d2ee0849 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1129,8 +1129,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3e06333d4bd..eb1e29ff0c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2144,7 +2144,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2237,7 +2237,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2548,7 +2549,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 41ae7e1590f..d5402b0eb6a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -652,7 +652,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, NULL); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1002,7 +1002,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1017,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 48c750b0e83..688f608b366 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) break; @@ -979,7 +979,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69e0b4750a..b271ed939d7 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -870,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1873,7 +1873,7 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2220,7 +2220,7 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb..f02d4029650 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -464,10 +464,7 @@ xfs_trans_get_buf( int numblks, uint flags) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_get_buf_map(tp, target, &map, 1, flags); } @@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp); + struct xfs_buf **bpp, + xfs_buf_iodone_t verify); static inline int xfs_trans_read_buf( @@ -486,13 +484,12 @@ xfs_trans_read_buf( xfs_daddr_t blkno, int numblks, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; - return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, verify); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267..977628207b4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -257,7 +257,8 @@ xfs_trans_read_buf_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -265,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -312,7 +313,9 @@ xfs_trans_read_buf_map( if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); + bp->b_iodone = verify; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -349,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 81c61fd1789..26880793fec 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -80,7 +80,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; -- cgit v1.2.3 From eab4e63368b4cfa597dbdac66d1a7a836a693b7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:02 +1100 Subject: xfs: uncached buffer reads need to return an error With verification being done as an IO completion callback, different errors can be returned from a read. Uncached reads only return a buffer or NULL on failure, which means the verification error cannot be returned to the caller. Split the error handling for these reads into two - a failure to get a buffer will still return NULL, but a read error will return a referenced buffer with b_error set rather than NULL. The caller is responsible for checking the error state of the buffer returned. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 9 ++------- fs/xfs/xfs_fsops.c | 5 +++++ fs/xfs/xfs_mount.c | 6 ++++++ fs/xfs/xfs_rtalloc.c | 9 ++++++++- 4 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0298dd68479..fbc965fc075 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -715,8 +715,7 @@ xfs_buf_read_uncached( int flags, xfs_buf_iodone_t verify) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) @@ -730,11 +729,7 @@ xfs_buf_read_uncached( bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5440768ec41..f35f8d7731f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -171,6 +171,11 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5402b0eb6a..df6d0b2aade 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,6 +658,12 @@ reread: xfs_warn(mp, "SB buffer read failed"); return EIO; } + if (bp->b_error) { + error = bp->b_error; + if (loud) + xfs_warn(mp, "SB validate failed"); + goto release_buf; + } /* * Initialize the mount structure from the superblock. diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b271ed939d7..98dc670d3ee 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1876,6 +1876,11 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); /* @@ -2221,8 +2226,10 @@ xfs_rtmount_init( bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { + if (!bp || bp->b_error) { xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); return EIO; } xfs_buf_relse(bp); -- cgit v1.2.3 From 98021821a502db347bd9c7671beeee6e8ce07ea6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:03 +1100 Subject: xfs: verify superblocks as they are read from disk Add a superblock verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. Adding verification shows that secondary superblocks never have their "sb_inprogress" flag cleared by mkfs.xfs, so when validating the secondary superblocks during a grow operation we have to avoid checking this field. Even if we fix mkfs, we will still have to ignore this field for verification purposes unless a version of mkfs that does not have this bug was used. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 4 +- fs/xfs/xfs_log_recover.c | 5 ++- fs/xfs/xfs_mount.c | 98 ++++++++++++++++++++++++++++++------------------ fs/xfs/xfs_mount.h | 3 +- 4 files changed, 69 insertions(+), 41 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f35f8d7731f..cb65b067ed3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -444,7 +444,8 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, + xfs_sb_read_verify); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), @@ -462,6 +463,7 @@ xfs_growfs_data_private( break; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index eb1e29ff0c7..924a4bc3d49 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3692,13 +3692,14 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); + bp->b_iodone = xfs_sb_read_verify; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -3710,7 +3711,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index df6d0b2aade..bff18d73c61 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -304,9 +304,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - int flags) + bool check_inprogress) { - int loud = !(flags & XFS_MFSI_QUIET); /* * If the log device and data device have the @@ -316,21 +315,18 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - if (loud) - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!xfs_sb_good_version(sbp)) { - if (loud) - xfs_warn(mp, "bad version"); + xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -338,8 +334,7 @@ xfs_mount_validate_sb( if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -373,8 +368,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - if (loud) - XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_CORRUPTION_ERROR("SB sanity check failed", XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -383,12 +377,10 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - if (loud) { - xfs_warn(mp, + xfs_warn(mp, "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - } return XFS_ERROR(ENOSYS); } @@ -402,23 +394,20 @@ xfs_mount_validate_sb( case 2048: break; default: - if (loud) - xfs_warn(mp, "inode size of %d bytes not supported", + xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return XFS_ERROR(ENOSYS); } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "file system too large to be mounted on this system."); return XFS_ERROR(EFBIG); } - if (unlikely(sbp->sb_inprogress)) { - if (loud) - xfs_warn(mp, "file system busy"); + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } @@ -426,9 +415,7 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. */ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - if (loud) - xfs_warn(mp, - "file system using version 1 directory format"); + xfs_warn(mp, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -521,11 +508,9 @@ out_unwind: void xfs_sb_from_disk( - struct xfs_mount *mp, + struct xfs_sb *to, xfs_dsb_t *from) { - struct xfs_sb *to = &mp->m_sb; - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -627,6 +612,50 @@ xfs_sb_to_disk( } } +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + int error; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + if (error) + xfs_buf_ioerror(bp, error); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, the run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + if (sb.sb_magicnum == XFS_SB_MAGIC) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EFSCORRUPTED); +} + /* * xfs_readsb * @@ -652,7 +681,9 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, NULL); + BTOBB(sector_size), 0, + loud ? xfs_sb_read_verify + : xfs_sb_quiet_read_verify); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -667,15 +698,8 @@ reread: /* * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. */ - xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { - if (loud) - xfs_warn(mp, "SB validate failed"); - goto release_buf; - } + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); /* * We must be able to do sector-sized and sector-aligned IO. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dc306a09f56..de9089acc61 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,10 +385,11 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_read_verify(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); #endif /* __XFS_MOUNT_H__ */ -- cgit v1.2.3 From 5d5f527d13369d0047d52b7ac4ddee4f8c0ad173 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:44:56 +1100 Subject: xfs: verify AGF blocks as they are read from disk Add an AGF block verify callback function and pass it into the buffer read functions. This replaces the existing verification that is done after the read completes. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 68 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 26 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 21c3db08fd0..c9eb955a49c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2091,6 +2091,47 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_read_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (free/alloc section). */ @@ -2102,44 +2143,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. - */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } -- cgit v1.2.3 From 3702ce6ed71cd60451ab278088863456dcb0dd99 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:05 +1100 Subject: xfs: verify AGI blocks as they are read from disk Add an AGI block verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 56 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 12e3dead439..5bd255e5f7b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1472,6 +1472,40 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int agi_ok; + + /* + * Validate the magic number of the agi block. + */ + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == + bp->b_pag->pag_agno; + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_check_agi_unlinked(agi); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (inode allocation section) */ @@ -1482,38 +1516,18 @@ xfs_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp) /* allocation group hdr buf */ { - struct xfs_agi *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - agi = XFS_BUF_TO_AGI(*bpp); - - /* - * Validate the magic number of the agi block. - */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && - be32_to_cpu(agi->agi_seqno) == agno; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_buf_set_ref(*bpp, XFS_AGI_REF); - - xfs_check_agi_unlinked(agi); return 0; } -- cgit v1.2.3 From bb80c6d79a3b0f9b6c3236a4bec021c72615bfd1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:06 +1100 Subject: xfs: verify AGFL blocks as they are read from disk Add an AGFL block verify callback function and pass it into the buffer read functions. While this commit adds verification code to the AGFL, it cannot be used reliably until the CRC format change comes along as mkfs does not initialise the full AGFL. Hence it can be full of garbage at the first mount and will fail verification right now. CRC enabled filesystems won't have this problem, so leave the code that has already been written ifdef'd out until the proper time. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index c9eb955a49c..38b4ab8957f 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,43 @@ xfs_alloc_fixup_trees( return 0; } +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group free block array. */ @@ -447,7 +484,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); -- cgit v1.2.3 From af133e8606d32c2aed43870491ebbdc56feec8a8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:07 +1100 Subject: xfs: verify inode buffers as they are read from disk Add an inode buffer verify callback function and pass it into the buffer read functions. Inodes are special in that the verbose checks will be done when reading the inode, but we still need to sanity check the buffer when that is first read. Always verify the magic numbers in all inodes in the buffer, rather than jus ton debug kernels. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 100 +++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d696301048..514eac913f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,6 +382,46 @@ xfs_inobp_check( } #endif +static void +xfs_inode_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + } + } + xfs_inobp_check(mp, bp); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -396,71 +436,33 @@ xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, + struct xfs_dinode **dipp, struct xfs_buf **bpp, uint buf_flags, uint iget_flags) { struct xfs_buf *bp; int error; - int i; - int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, NULL); + (int)imap->im_len, buf_flags, &bp, + xfs_inode_buf_verify); if (error) { - if (error != EAGAIN) { - xfs_warn(mp, - "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - } else { + if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); + return error; } - return error; - } - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - */ -#ifdef DEBUG - ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_UNTRUSTED) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)imap->im_blkno, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; } - xfs_inobp_check(mp, bp); - *bpp = bp; *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; -- cgit v1.2.3 From 3d3e6f64e22c94115d47de670611bcd3ecda3796 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:08 +1100 Subject: xfs: verify btree blocks as they are read from disk Add an btree block verify callback function and pass it into the buffer read functions. Because each different btree block type requires different verification, add a function to the ops structure that is called from the generic code. Also, propagate the verification callback functions through the readahead functions, and into the external bmap and bulkstat inode readahead code that uses the generic btree buffer read functions. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 61 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap.c | 60 +++++++++++++++++++++++++----------------- fs/xfs/xfs_bmap_btree.c | 47 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 66 ++++++++++++++++++++++++----------------------- fs/xfs/xfs_btree.h | 10 ++++--- fs/xfs/xfs_ialloc_btree.c | 40 ++++++++++++++++++++++++++++ fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_itable.c | 3 ++- 10 files changed, 230 insertions(+), 61 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f7876c6d616..46961e52e9b 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,6 +272,66 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -327,6 +387,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .read_verify = xfs_allocbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a60f3d1f151..9ae7aba52e0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) @@ -4078,8 +4079,9 @@ xfs_bmap_read_extents( * pointer (leftmost) at each level. */ while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( @@ -4124,7 +4126,8 @@ xfs_bmap_read_extents( */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); + xfs_btree_reada_bufl(mp, nextbno, 1, + xfs_bmbt_read_verify); /* * Copy records into the extent records. */ @@ -4156,8 +4159,9 @@ xfs_bmap_read_extents( */ if (bno == NULLFSBLOCK) break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); } @@ -5868,15 +5872,16 @@ xfs_bmap_check_leaf_extents( */ while (level-- > 0) { /* See if buf is in cur first */ + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( xfs_bmap_sanity_check(mp, bp, level), @@ -5953,15 +5958,16 @@ xfs_bmap_check_leaf_extents( if (bno == NULLFSBLOCK) break; + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { @@ -6052,7 +6058,9 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); @@ -6061,8 +6069,10 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; nextblock = XFS_BUF_TO_BLOCK(nbp); @@ -6091,8 +6101,10 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7..bddca9b9286 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -36,6 +36,7 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_trace.h" /* * Determine the extent state. @@ -707,6 +708,51 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int lblock_ok; /* block passes checks */ + + /* magic number and level verification. + * + * We don't know waht fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && + level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); + + /* numrecs verification */ + lblock_ok = lblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + + /* sibling pointer verification */ + lblock_ok = lblock_ok && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (!lblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -746,6 +792,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .read_verify = xfs_bmbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f8..1d00fbe9dd7 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,6 +232,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern void xfs_bmbt_read_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 7e791160092..ef1066078c3 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -270,7 +270,8 @@ xfs_btree_dup_cursor( if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, - 0, &bp, NULL); + 0, &bp, + cur->bc_ops->read_verify); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -612,23 +613,24 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. */ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify) +{ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, NULL); + mp->m_bsize, lock, &bp, verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -645,15 +647,16 @@ xfs_btree_read_bufl( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } /* @@ -663,17 +666,18 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } STATIC int @@ -687,12 +691,14 @@ xfs_btree_readahead_lblock( xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1); + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1); + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->read_verify); rval++; } @@ -712,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1); + left, 1, cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1); + right, 1, cur->bc_ops->read_verify); rval++; } @@ -1016,19 +1022,15 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, NULL); + mp->m_bsize, flags, bpp, + cur->bc_ops->read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); - - error = xfs_btree_check_block(cur, *block, level, *bpp); - if (error) - xfs_trans_brelse(cur->bc_tp, *bpp); - return error; + return 0; } /* diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index c9cf2d00e23..3a4c314047a 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,6 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + void (*read_verify)(struct xfs_buf *bp); #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, @@ -355,7 +356,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -365,7 +367,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -376,7 +379,8 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa1..11306c6d61c 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC int @@ -181,6 +182,44 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* magic number and level verification */ + level = be16_to_cpu(block->bb_level); + sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && + level < mp->m_in_maxlevels; + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -218,6 +257,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, + .read_verify = xfs_inobt_read_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 514eac913f1..3a243d07695 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -static void +void xfs_inode_buf_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 21b4de3df71..1a892114792 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,6 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_inode_buf_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3998fd2a794..0f18d412e3e 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -396,7 +396,8 @@ xfs_bulkstat( if (xfs_inobt_maskn(chunkidx, nicluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); + agbno, nbcluster, + xfs_inode_buf_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; -- cgit v1.2.3 From c6319198702350a2215a8c0cacd6cc4283728a1b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:50:13 +1100 Subject: xfs: verify dquot blocks as they are read from disk Add a dquot buffer verify callback function and pass it into the buffer read functions. This checks all the dquots in a buffer, but cannot completely verify the dquot ids are correct. Also, errors cannot be repaired, so an additional function is added to repair bad dquots in the buffer if such an error is detected in a context where repair is allowed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 117 +++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_dquot.h | 1 + fs/xfs/xfs_qm.c | 3 +- 3 files changed, 98 insertions(+), 23 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e95f800333d..0ba0f0992d6 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,6 +360,89 @@ xfs_qm_dqalloc( return (error); } +void +xfs_dquot_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", + XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_qm_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} + /* * Maps a dquot to the buffer containing its on-disk version. * This returns a ptr to the buffer containing the on-disk dquot @@ -378,7 +461,6 @@ xfs_qm_dqtobp( xfs_buf_t *bp; xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); xfs_mount_t *mp = dqp->q_mount; - xfs_disk_dquot_t *ddq; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); @@ -439,33 +521,24 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, NULL); - if (error || !bp) - return XFS_ERROR(error); - } + 0, &bp, xfs_dquot_read_verify); - ASSERT(xfs_buf_islocked(bp)); - - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = bp->b_addr + dqp->q_bufoffset; + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } - /* - * A simple sanity check in case we got a corrupted dquot... - */ - error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp"); - if (error) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); + if (error) { + ASSERT(bp == NULL); + return XFS_ERROR(error); } } + ASSERT(xfs_buf_islocked(bp)); *O_bpp = bp; - *O_ddpp = ddq; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; return (0); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346..a08ba92d7da 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,6 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); +extern void xfs_dquot_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 688f608b366..a6dfb97490c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,8 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + xfs_dquot_read_verify); if (error) break; -- cgit v1.2.3 From 4bb20a83a2a5ac4dcb62780c9950e47939956126 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:10 +1100 Subject: xfs: add verifier callback to directory read code Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 23 ++++++++++++----------- fs/xfs/xfs_attr_leaf.c | 18 +++++++++--------- fs/xfs/xfs_da_btree.c | 44 ++++++++++++++++++++++++++++---------------- fs/xfs/xfs_da_btree.h | 7 ++++--- fs/xfs/xfs_dir2_block.c | 23 ++++++++++++----------- fs/xfs/xfs_dir2_leaf.c | 33 ++++++++++++++++----------------- fs/xfs/xfs_dir2_node.c | 43 ++++++++++++++++++++----------------------- fs/xfs/xfs_file.c | 2 +- 8 files changed, 102 insertions(+), 91 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 474c57a43cc..cd5a9cd0ded 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -904,7 +904,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1032,7 +1032,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * remove the "old" attr from that block (neat, huh!) */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1101,7 +1101,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -1159,7 +1159,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1190,7 +1190,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, + NULL); if (error) return XFS_ERROR(error); ASSERT(bp != NULL); @@ -1605,7 +1606,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) state->path.blk[0].bp = NULL; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == @@ -1718,7 +1719,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1737,7 +1738,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1827,7 +1828,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) bp = NULL; if (cursor->blkno > 0) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1870,7 +1871,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) for (;;) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely(bp == NULL)) { @@ -1937,7 +1938,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely((bp == NULL))) { diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 4bfc732bc9c..ba2b9a2cd23 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -871,7 +871,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) if (error) goto out; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT(bp1 != NULL); @@ -1642,7 +1642,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + blkno, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -2519,7 +2519,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set up the operation. */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2584,7 +2584,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Set up the operation. */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2641,7 +2641,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "old" attr */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2652,7 +2652,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ if (args->blkno2 != args->blkno) { error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); + -1, &bp2, XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2753,7 +2753,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2839,7 +2839,7 @@ xfs_attr_node_inactive( * before we come back to this one. */ error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (child_bp) { @@ -2880,7 +2880,7 @@ xfs_attr_node_inactive( */ if ((i+1) < count) { error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4af8bad7068..f9e9149de00 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -747,7 +747,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork); + args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -838,7 +838,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork); + blkno, -1, &bp, state->args->whichfork, + NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1084,7 +1085,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork); + -1, &blk->bp, args->whichfork, NULL); if (error) { blk->blkno = 0; state->path.active--; @@ -1247,7 +1248,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->back) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1268,7 +1269,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1368,7 +1369,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->back) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1385,7 +1386,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1470,7 +1471,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + &blk->bp, args->whichfork, NULL); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1733,7 +1734,8 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + if (error) return error; /* * Copy the last block into the dead buffer and log it. @@ -1759,7 +1761,9 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1780,7 +1784,9 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1803,7 +1809,9 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != @@ -1853,7 +1861,9 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely( @@ -2139,7 +2149,8 @@ xfs_da_read_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2161,7 +2172,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, NULL); + mapp, nmap, 0, &bp, verifier); if (error) goto out_free; @@ -2217,7 +2228,8 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 132adafb041..bf8bfaa0d35 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -18,7 +18,6 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; struct xfs_mount; @@ -226,9 +225,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork, + xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + xfs_dablk_t bno, int whichfork, + xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e93ca8f054f..53666ca6c95 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -97,10 +97,10 @@ xfs_dir2_block_addname( /* * Read the (one and only) directory block into dabuf bp. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; /* @@ -457,7 +457,7 @@ xfs_dir2_block_getdents( * Can't read the block, give up, else get dabuf in bp. */ error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); if (error) return error; @@ -640,10 +640,10 @@ xfs_dir2_block_lookup_int( /* * Read the buffer, return error if we can't get it. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); @@ -917,10 +917,11 @@ xfs_dir2_leaf_to_block( /* * Read the data block if we don't already have it, give up if it fails. */ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - return error; + if (!dbp) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) + return error; } hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index bac86984e40..86e3dc1de0e 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -315,10 +315,9 @@ xfs_dir2_leaf_addname( * Read the leaf block. */ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(lbp != NULL); /* * Look up the entry by hash value and name. @@ -500,9 +499,9 @@ xfs_dir2_leaf_addname( * Just read that one in. */ else { - if ((error = - xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), + -1, &dbp, XFS_DATA_FORK, NULL); + if (error) { xfs_trans_brelse(tp, lbp); return error; } @@ -895,7 +894,7 @@ xfs_dir2_leaf_readbuf( error = xfs_da_read_buf(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); /* * Should just skip over the data block instead of giving up. @@ -938,7 +937,7 @@ xfs_dir2_leaf_readbuf( xfs_da_reada_buf(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); mip->ra_current = i; } @@ -1376,7 +1375,7 @@ xfs_dir2_leaf_lookup_int( * Read the leaf block into the buffer. */ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); if (error) return error; *lbpp = lbp; @@ -1411,7 +1410,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1453,7 +1452,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1738,10 +1737,10 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -1864,10 +1863,10 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 6c705240660..290c2b1016a 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -399,7 +399,7 @@ xfs_dir2_leafn_lookup_for_addname( */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; free = curbp->b_addr; @@ -536,7 +536,7 @@ xfs_dir2_leafn_lookup_for_entry( } else { error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; } @@ -915,10 +915,10 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), + -1, &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == @@ -1169,11 +1169,10 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); /* * Count bytes in the two blocks combined. @@ -1454,14 +1453,13 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; @@ -1520,9 +1518,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; /* @@ -1631,7 +1629,7 @@ xfs_dir2_node_addname_int( * Read the data block in. */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) return error; hdr = dbp->b_addr; @@ -1917,11 +1915,10 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c42f99e71f1..f6dab7da7bc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -891,7 +891,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); xfs_iunlock(ip, mode); return 0; } -- cgit v1.2.3 From 20f7e9f3726a27cccade65c28265eef8ca50eecb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:11 +1100 Subject: xfs: factor dir2 block read operations In preparation for verifying dir2 block format buffers, factor the read operations out of the block operations (lookup, addname, getdents) and some of the additional logic to make it easier to understand an dmodify the code. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 386 ++++++++++++++++++++++++++---------------------- 1 file changed, 209 insertions(+), 177 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 53666ca6c95..25ce409487b 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,178 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static int +xfs_dir2_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + + return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, NULL); +} + +static void +xfs_dir2_block_need_space( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = hdr->bestfree; + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); +} + /* * Add an entry to a block directory. */ @@ -63,7 +235,6 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ @@ -94,134 +265,44 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); - hdr = bp->b_addr; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + len = xfs_dir2_data_entsize(args->namelen); + /* * Set up pointers to parts of the block. */ - bf = hdr->bestfree; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + /* - * No stale entries? Need space for entry and new leaf. - */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. - */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } - /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? - */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - compact = 0; - } - /* - * Will need to compact to make this work. + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; - } + xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * If this isn't a real add, we're done with the buffer. + * Done everything we need for a space check now. */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; + } + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -232,65 +313,24 @@ xfs_dir2_block_addname( return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. - * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. - */ - if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (blp[fromidx].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); - /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. - */ - if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. */ + if (compact) + xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); else if (btp->stale) { + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -450,18 +490,13 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(NULL, dp, &bp); if (error) return error; - ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -637,14 +672,11 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); -- cgit v1.2.3 From 82025d7f79148fe66a1594a0ebe4ab38152cf9e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:12 +1100 Subject: xfs: verify dir2 block format buffers Add a dir2 block format read verifier. To fully verify every block when read, call xfs_dir2_data_check() on them. Change xfs_dir2_data_check() to do runtime checking, convert ASSERT() checks to XFS_WANT_CORRUPTED_RETURN(), which will trigger an ASSERT failure on debug kernels, but on production kernels will dump an error to dmesg and return EFSCORRUPTED to the caller. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 22 ++++++++++++++- fs/xfs/xfs_dir2_data.c | 73 +++++++++++++++++++++++++++++-------------------- fs/xfs/xfs_dir2_priv.h | 4 ++- 3 files changed, 68 insertions(+), 31 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 25ce409487b..57351b86886 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,26 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static void +xfs_dir2_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + static int xfs_dir2_block_read( struct xfs_trans *tp, @@ -65,7 +85,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, NULL); + XFS_DATA_FORK, xfs_dir2_block_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 44ffd4d6bc9..cb117234e32 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -34,14 +34,13 @@ STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. + * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( +int +__xfs_dir2_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -64,18 +63,23 @@ xfs_dir2_data_check( int stale; /* count of stale leaves */ struct xfs_name name; - mp = dp->i_mount; + mp = bp->b_target->bt_mount; hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else { - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; } count = lastfree = freeseen = 0; @@ -83,19 +87,22 @@ xfs_dir2_data_check( * Account for zero bestfree entries. */ if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -107,17 +114,20 @@ xfs_dir2_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -130,10 +140,12 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { @@ -148,27 +160,30 @@ xfs_dir2_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - ASSERT(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; } -#endif /* * Given a data block and an unused entry from that block, diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 3523d3e15aa..93b8f66ae78 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -41,10 +41,12 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); #else #define xfs_dir2_data_check(dp,bp) #endif +extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); + extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); -- cgit v1.2.3 From 2025207ca6738a1217126ef14af9d104433f9824 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:13 +1100 Subject: xfs: factor dir2 free block reading Also factor out the updating of the free block when removing entries from leaf blocks, and add a verifier callback for reads. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 3 +- fs/xfs/xfs_dir2_node.c | 218 +++++++++++++++++++++++++++++++------------------ fs/xfs/xfs_dir2_priv.h | 2 + 3 files changed, 143 insertions(+), 80 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 86e3dc1de0e..6c1359dc989 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1863,8 +1863,7 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); if (error) return error; free = fbp->b_addr; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 290c2b1016a..d7f899dfbff 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -55,6 +55,57 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); +static void +xfs_dir2_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", + XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static int +__xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_free_verify); +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); +} + /* * Log entries from a freespace block. */ @@ -394,12 +445,10 @@ xfs_dir2_leafn_lookup_for_addname( */ if (curbp) xfs_trans_brelse(tp, curbp); - /* - * Read the free block. - */ - error = xfs_da_read_buf(tp, dp, + + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK, NULL); + &curbp); if (error) return error; free = curbp->b_addr; @@ -825,6 +874,77 @@ xfs_dir2_leafn_rebalance( } } +static int +xfs_dir2_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + struct xfs_trans *tp = args->trans; + int logfree = 0; + + if (!hdr) { + /* One less used entry in the free table. */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If this was the last entry in the table, we can trim the + * table size back. There might be other entries at the end + * referring to non-existent data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } else { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -908,15 +1028,14 @@ xfs_dir2_leafn_remove( xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + &fbp); if (error) return error; free = fbp->b_addr; @@ -954,68 +1073,12 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (hdr == NULL) { - /* - * One less used entry in the free table. - */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && - free->bests[i] == cpu_to_be16(NULLDATAOFF); - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + error = xfs_dir2_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } + xfs_dir2_leafn_check(dp, bp); /* * Return indication of whether this leaf block is empty enough @@ -1453,9 +1516,9 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; if (!fbp) @@ -1518,8 +1581,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; @@ -1915,17 +1979,15 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); if (error) return error; /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. */ - if (bp == NULL) { + if (!bp) return 0; - } free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 93b8f66ae78..263a6328791 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -117,6 +117,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); -- cgit v1.2.3 From e4813572640e27d3a5cce3f06751a9f54f77aaa5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:14 +1100 Subject: xfs: factor out dir2 data block reading And add a verifier callback function while there. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 3 +-- fs/xfs/xfs_dir2_data.c | 32 ++++++++++++++++++++++++++++++++ fs/xfs/xfs_dir2_leaf.c | 38 +++++++++++++++++--------------------- fs/xfs/xfs_dir2_node.c | 8 ++++---- fs/xfs/xfs_dir2_priv.h | 2 ++ 5 files changed, 56 insertions(+), 27 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 57351b86886..ca03b109772 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -970,8 +970,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index cb117234e32..0ef04f1bf51 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,6 +185,38 @@ __xfs_dir2_data_check( return 0; } +static void +xfs_dir2_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_dir2_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 6c1359dc989..0fdf765c917 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -493,14 +493,14 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK, NULL); + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -508,7 +508,6 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ @@ -891,10 +890,9 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, + error = xfs_dir2_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK, NULL); + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); /* * Should just skip over the data block instead of giving up. @@ -1408,14 +1406,13 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* @@ -1450,9 +1447,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1737,8 +1734,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index d7f899dfbff..67b811c17ea 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -583,9 +583,9 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_da_read_buf(tp, dp, + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK, NULL); + -1, &curbp); if (error) return error; } @@ -1692,8 +1692,8 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp); if (error) return error; hdr = dbp->b_addr; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 263a6328791..71ec8283910 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -46,6 +46,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #define xfs_dir2_data_check(dp,bp) #endif extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, -- cgit v1.2.3 From e6f7667c4eef42b6f5bc6cdeb31d0bab62fe5f79 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:15 +1100 Subject: xfs: factor dir2 leaf read Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 73 ++++++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_dir2_node.c | 6 ++--- fs/xfs/xfs_dir2_priv.h | 2 ++ 3 files changed, 67 insertions(+), 14 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0fdf765c917..97408e3287e 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -48,6 +48,62 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void +xfs_dir2_leaf_verify( + struct xfs_buf *bp, + __be16 magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == magic; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_dir2_leaf1_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leafn_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +static int +xfs_dir2_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leaf1_verify); +} + +int +xfs_dir2_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leafn_verify); +} /* * Convert a block form directory to a leaf form directory. @@ -311,14 +367,11 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. * We know it's not there, our caller has already done a lookup. @@ -1369,13 +1422,11 @@ xfs_dir2_leaf_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; + *lbpp = lbp; leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 67b811c17ea..7c6f95697e2 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1232,11 +1232,11 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_DATA_FORK, NULL); + error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return error; - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 71ec8283910..4560825d099 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -- cgit v1.2.3 From ad14c33ac862601c4c22755ed3b59f1906b134e5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:16 +1100 Subject: xfs: factor and verify attr leaf reads Some reads are not converted yet because it isn't obvious ahead of time what the format of the block is going to be. Need to determine how to tell if the first block in the tree is a node or leaf format block. That will be done in later patches. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 70 ++++++++++++-------------------------------- fs/xfs/xfs_attr_leaf.c | 78 +++++++++++++++++++++++++++++--------------------- fs/xfs/xfs_attr_leaf.h | 3 ++ 3 files changed, 66 insertions(+), 85 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cd5a9cd0ded..d644915367e 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if @@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); return(error); } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1158,11 +1153,9 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { @@ -1183,25 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args) STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context) { - xfs_attr_leafblock_t *leaf; int error; struct xfs_buf *bp; trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, - NULL); + error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - ASSERT(bp != NULL); - leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } error = xfs_attr_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); @@ -1605,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1920,14 +1900,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != - cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } error = xfs_attr_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); @@ -1937,16 +1909,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); if (error) - return(error); - if (unlikely((bp == NULL))) { - XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } + return error; } xfs_trans_brelse(NULL, bp); return(0); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ba2b9a2cd23..357971536d5 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,6 +88,36 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +static void +xfs_attr_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_attr_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, xfs_attr_leaf_verify); +} + /*======================================================================== * Namespace helper routines *========================================================================*/ @@ -870,11 +900,10 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); + bp2 = NULL; error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, XFS_ATTR_FORK); @@ -1641,18 +1670,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); @@ -2518,15 +2545,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2583,15 +2606,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2640,35 +2659,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } leaf1 = bp1->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; leaf2 = bp2->b_addr; - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index dea17722945..8f7ab986f45 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,6 +227,9 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); /* * Routines used for growing the Btree. -- cgit v1.2.3 From d9392a4bb75503fc2adbb5237c3df940c6467eb2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:17 +1100 Subject: xfs: add xfs_da_node verification Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 22 ++++------ fs/xfs/xfs_attr_leaf.c | 12 +++--- fs/xfs/xfs_attr_leaf.h | 8 ++-- fs/xfs/xfs_da_btree.c | 109 +++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_da_btree.h | 3 ++ fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dir2_priv.h | 1 + 7 files changed, 107 insertions(+), 50 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index d644915367e..aaf472532b3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1696,10 +1696,10 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1715,10 +1715,10 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1807,8 +1807,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1849,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_read_buf(NULL, context->dp, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + XFS_ATTR_FORK); if (error) return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 357971536d5..efe170da288 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void +void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -2765,7 +2765,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2850,8 +2850,8 @@ xfs_attr_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); if (error) return(error); if (child_bp) { @@ -2891,8 +2891,8 @@ xfs_attr_node_inactive( * child block number. */ if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 8f7ab986f45..098e9a58ad9 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,9 +227,6 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp); /* * Routines used for growing the Btree. @@ -264,4 +261,9 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr_leaf_verify(struct xfs_buf *bp); + #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index f9e9149de00..1b84fc50a05 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -91,6 +91,68 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); +static void +__xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_node_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); + block_ok = block_ok && + be16_to_cpu(hdr->level) > 0 && + be16_to_cpu(hdr->count) > 0 ; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + __xfs_da_node_verify(bp); + return; + case XFS_ATTR_LEAF_MAGIC: + xfs_attr_leaf_verify(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + xfs_dir2_leafn_verify(bp); + return; + default: + break; + } + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_da_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, xfs_da_node_verify); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -746,8 +808,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) */ child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -837,9 +899,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork, - NULL); + error = xfs_da_node_read(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1084,8 +1145,8 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; @@ -1246,9 +1307,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1267,9 +1328,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1367,9 +1428,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1384,9 +1445,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1470,8 +1531,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1734,7 +1795,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1761,8 +1822,7 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1784,8 +1844,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1809,8 +1868,7 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; @@ -1861,8 +1919,7 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index bf8bfaa0d35..2d1bec4b759 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -213,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 97408e3287e..67cc21c2a45 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -74,7 +74,7 @@ xfs_dir2_leaf1_verify( xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } -static void +void xfs_dir2_leafn_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 4560825d099..e0b96e7693e 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, -- cgit v1.2.3 From da6958c873ecd846d71fafbfe0f6168bb9c2c99e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:18 +1100 Subject: xfs: Add verifiers to dir2 data readahead. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_da_btree.h | 4 ++-- fs/xfs/xfs_dir2_data.c | 13 ++++++++++++- fs/xfs/xfs_dir2_leaf.c | 11 +++++------ fs/xfs/xfs_dir2_priv.h | 2 ++ fs/xfs/xfs_file.c | 4 +++- 6 files changed, 26 insertions(+), 12 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 1b84fc50a05..93ebc0fc6dd 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2285,10 +2285,10 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mappedbno, int whichfork, xfs_buf_iodone_t verifier) { - xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; struct xfs_buf_map *mapp; int nmap; @@ -2296,7 +2296,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 2d1bec4b759..521b008445a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -231,8 +231,8 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bpp, int whichfork, xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork, - xfs_buf_iodone_t verifier); + xfs_dablk_t bno, xfs_daddr_t mapped_bno, + int whichfork, xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 0ef04f1bf51..1a43c8593c0 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -static void +void xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -217,6 +217,17 @@ xfs_dir2_data_read( XFS_DATA_FORK, xfs_dir2_data_verify); } +int +xfs_dir2_data_readahead( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 67cc21c2a45..8a95547d42a 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -972,11 +972,11 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + - mip->ra_offset), - (int)BTOBB(mp->m_dirblksize), NULL); + mip->ra_offset)); mip->ra_current = i; } @@ -985,10 +985,9 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. */ else if (i > mip->ra_current) { - xfs_da_reada_buf(NULL, dp, + xfs_dir2_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + - mip->ra_offset, - XFS_DATA_FORK, NULL); + mip->ra_offset, -1); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index e0b96e7693e..daf5d0fc616 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -48,6 +48,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f6dab7da7bc..400b187595b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,8 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" @@ -891,7 +893,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); + xfs_dir2_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } -- cgit v1.2.3 From cfb02852226aa449fe27075caffe88726507668c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:19 +1100 Subject: xfs: add buffer pre-write callback Add a callback to the buffer write path to enable verification of the buffer and CRC calculation prior to issuing the write to the underlying storage. If the callback function detects some kind of failure or error condition, it must mark the buffer with an error so that the caller can take appropriate action. In the case of xfs_buf_ioapply(), a corrupt metadta buffer willt rigger a shutdown of the filesystem, because something is clearly wrong and we can't allow corrupt metadata to be written to disk. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++++++++++++++++ fs/xfs/xfs_buf.h | 3 +++ 2 files changed, 19 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fbc965fc075..bd1a948ee39 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -569,7 +569,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_pre_io = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -1323,6 +1325,20 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; + /* + * run the pre-io callback function if it exists. If this function + * fails it will mark the buffer with an error and the IO should + * not be dispatched. + */ + if (bp->b_pre_io) { + bp->b_pre_io(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 677b1dc822f..51bc16a1cd9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -155,6 +155,9 @@ typedef struct xfs_buf { unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + void (*b_pre_io)(struct xfs_buf *); + /* pre-io callback function */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif -- cgit v1.2.3 From 612cfbfe174a89d565363fff7f3961a2dda5fb71 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:52:32 +1100 Subject: xfs: add pre-write metadata buffer verifier callbacks These verifiers are essentially the same code as the read verifiers, but do not require ioend processing. Hence factor the read verifier functions and add a new write verifier wrapper that is used as the callback. This is done as one large patch for all verifiers rather than one patch per verifier as the change is largely mechanical. This includes hooking up the write verifier via the read verifier function. Hooking up the write verifier for buffers obtained via xfs_trans_get_buf() will be done in a separate patch as that touches code in many different places rather than just the verifier functions. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 35 ++++++++++++++++++++++++++++++++--- fs/xfs/xfs_alloc_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_attr_leaf.c | 19 +++++++++++++++++-- fs/xfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_bmap_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_da_btree.c | 37 +++++++++++++++++++++++++------------ fs/xfs/xfs_dir2_block.c | 16 +++++++++++++++- fs/xfs/xfs_dir2_data.c | 19 +++++++++++++++++-- fs/xfs/xfs_dir2_leaf.c | 31 ++++++++++++++++++++++++------- fs/xfs/xfs_dir2_node.c | 17 ++++++++++++++++- fs/xfs/xfs_dir2_priv.h | 2 +- fs/xfs/xfs_dquot.c | 27 +++++++++++++++++++++------ fs/xfs/xfs_dquot.h | 2 +- fs/xfs/xfs_ialloc.c | 17 ++++++++++++++++- fs/xfs/xfs_ialloc_btree.c | 19 ++++++++++++++++--- fs/xfs/xfs_inode.c | 19 +++++++++++++++++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_mount.c | 19 +++++++++++++++++-- fs/xfs/xfs_qm.c | 2 +- 20 files changed, 273 insertions(+), 56 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 38b4ab8957f..d12bbedf6fe 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,8 +430,8 @@ xfs_alloc_fixup_trees( return 0; } -void -xfs_agfl_read_verify( +static void +xfs_agfl_verify( struct xfs_buf *bp) { #ifdef WHEN_CRCS_COME_ALONG @@ -463,6 +463,21 @@ xfs_agfl_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } #endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); + bp->b_pre_io = xfs_agfl_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -2129,7 +2144,7 @@ xfs_alloc_put_freelist( } static void -xfs_agf_read_verify( +xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -2164,7 +2179,21 @@ xfs_agf_read_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} +void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); + bp->b_pre_io = xfs_agf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 46961e52e9b..6e98b22ebde 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,8 +272,8 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -void -xfs_allocbt_read_verify( +static void +xfs_allocbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -323,11 +323,24 @@ xfs_allocbt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); + bp->b_pre_io = xfs_allocbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index efe170da288..57729d71ab1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -void +static void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -101,11 +101,26 @@ xfs_attr_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_attr_leaf_write_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} +void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); + bp->b_pre_io = xfs_attr_leaf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + int xfs_attr_leaf_read( struct xfs_trans *tp, @@ -115,7 +130,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_verify); + XFS_ATTR_FORK, xfs_attr_leaf_read_verify); } /*======================================================================== diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 098e9a58ad9..3bbf6277e43 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,6 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_verify(struct xfs_buf *bp); +void xfs_attr_leaf_read_verify(struct xfs_buf *bp); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bddca9b9286..17d7423e750 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -708,8 +708,8 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -void -xfs_bmbt_read_verify( +static void +xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -744,11 +744,24 @@ xfs_bmbt_read_verify( if (!lblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); + bp->b_pre_io = xfs_bmbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 93ebc0fc6dd..6bb0a59eaae 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -92,7 +92,7 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); static void -__xfs_da_node_verify( +xfs_da_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -108,12 +108,17 @@ __xfs_da_node_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } static void -xfs_da_node_verify( +xfs_da_node_write_verify( + struct xfs_buf *bp) +{ + xfs_da_node_verify(bp); +} + +static void +xfs_da_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -121,21 +126,22 @@ xfs_da_node_verify( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: - __xfs_da_node_verify(bp); - return; + xfs_da_node_verify(bp); + break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_verify(bp); + xfs_attr_leaf_read_verify(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_verify(bp); + xfs_dir2_leafn_read_verify(bp); return; default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); - + bp->b_pre_io = xfs_da_node_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -150,7 +156,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_verify); + which_fork, xfs_da_node_read_verify); } /*======================================================================== @@ -816,7 +822,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the pre_io pointer as well to match the buffer type change + * that could occur. + */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + root_blk->bp->b_pre_io = bp->b_pre_io; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ca03b109772..0f8793c74fe 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -71,7 +71,21 @@ xfs_dir2_block_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_dir2_block_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); + bp->b_pre_io = xfs_dir2_block_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -85,7 +99,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_verify); + XFS_DATA_FORK, xfs_dir2_block_read_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 1a43c8593c0..b555585f5ab 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -200,11 +200,26 @@ xfs_dir2_data_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} +void +xfs_dir2_data_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); + bp->b_pre_io = xfs_dir2_data_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + int xfs_dir2_data_read( struct xfs_trans *tp, @@ -214,7 +229,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } int @@ -225,7 +240,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 8a95547d42a..5b3bcab2a65 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -62,23 +62,40 @@ xfs_dir2_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_leaf1_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} +static void +xfs_dir2_leaf1_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + bp->b_pre_io = xfs_dir2_leaf1_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } static void -xfs_dir2_leaf1_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_write_verify( + struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void -xfs_dir2_leafn_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); } static int @@ -90,7 +107,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_verify); + XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); } int @@ -102,7 +119,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_verify); + XFS_DATA_FORK, xfs_dir2_leafn_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 7c6f95697e2..a58abe1fc0d 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -69,11 +69,26 @@ xfs_dir2_free_verify( XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_dir2_free_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} +void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); + bp->b_pre_io = xfs_dir2_free_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + static int __xfs_dir2_free_read( struct xfs_trans *tp, @@ -83,7 +98,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_verify); + XFS_DATA_FORK, xfs_dir2_free_read_verify); } int diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index daf5d0fc616..7ec61af8449 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -72,7 +72,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0ba0f0992d6..b38a10e6f2e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,8 +360,8 @@ xfs_qm_dqalloc( return (error); } -void -xfs_dquot_read_verify( +static void +xfs_dquot_buf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -388,12 +388,26 @@ xfs_dquot_read_verify( error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, "xfs_dquot_read_verify"); if (error) { - XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", - XFS_ERRLEVEL_LOW, mp, d); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); xfs_buf_ioerror(bp, EFSCORRUPTED); break; } } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -413,7 +427,7 @@ xfs_qm_dqrepair( /* * Read the buffer without verification so we get the corrupted - * buffer returned to us. + * buffer returned to us. make sure we verify it on write, though. */ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, @@ -423,6 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } + (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -521,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_read_verify); + 0, &bp, xfs_dquot_buf_read_verify); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index a08ba92d7da..5438d883b62 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_read_verify(struct xfs_buf *bp); +extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5bd255e5f7b..070f4184557 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1473,7 +1473,7 @@ xfs_check_agi_unlinked( #endif static void -xfs_agi_read_verify( +xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -1502,6 +1502,21 @@ xfs_agi_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } xfs_check_agi_unlinked(agi); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); + bp->b_pre_io = xfs_agi_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 11306c6d61c..15a79f8ca03 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -183,7 +183,7 @@ xfs_inobt_key_diff( } void -xfs_inobt_read_verify( +xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -211,11 +211,24 @@ xfs_inobt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); + bp->b_pre_io = xfs_inobt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3a243d07695..910b2da0104 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -void +static void xfs_inode_buf_verify( struct xfs_buf *bp) { @@ -418,6 +418,21 @@ xfs_inode_buf_verify( } } xfs_inobp_check(mp, bp); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); + bp->b_pre_io = xfs_inode_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -447,7 +462,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1a892114792..a322c19723a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,7 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_verify(struct xfs_buf *); +void xfs_inode_buf_read_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 0f18d412e3e..7f86fdaab7a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bff18d73c61..c85da75e4a4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -612,8 +612,8 @@ xfs_sb_to_disk( } } -void -xfs_sb_read_verify( +static void +xfs_sb_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -629,6 +629,21 @@ xfs_sb_read_verify( error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); if (error) xfs_buf_ioerror(bp, error); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); + bp->b_pre_io = xfs_sb_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a6dfb97490c..bd40ae9624e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_read_verify); + xfs_dquot_buf_read_verify); if (error) break; -- cgit v1.2.3 From b0f539de9fcc543a3ffa40bc22bf51aca6ea6183 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:53:49 +1100 Subject: xfs: connect up write verifiers to new buffers Metadata buffers that are read from disk have write verifiers already attached to them, but newly allocated buffers do not. Add appropriate write verifiers to all new metadata buffers. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 8 ++-- fs/xfs/xfs_alloc.h | 3 ++ fs/xfs/xfs_alloc_btree.c | 1 + fs/xfs/xfs_attr_leaf.c | 4 +- fs/xfs/xfs_bmap.c | 2 + fs/xfs/xfs_bmap_btree.c | 3 +- fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 1 + fs/xfs/xfs_btree.h | 2 + fs/xfs/xfs_da_btree.c | 3 ++ fs/xfs/xfs_dir2_block.c | 2 + fs/xfs/xfs_dir2_data.c | 11 +++-- fs/xfs/xfs_dir2_leaf.c | 19 +++++---- fs/xfs/xfs_dir2_node.c | 24 +++++++---- fs/xfs/xfs_dir2_priv.h | 2 + fs/xfs/xfs_dquot.c | 104 +++++++++++++++++++++++----------------------- fs/xfs/xfs_fsops.c | 8 +++- fs/xfs/xfs_ialloc.c | 5 ++- fs/xfs/xfs_ialloc.h | 4 +- fs/xfs/xfs_ialloc_btree.c | 1 + fs/xfs/xfs_inode.c | 14 ++++++- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 1 + 24 files changed, 137 insertions(+), 89 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d12bbedf6fe..545a6c4c236 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,14 +465,14 @@ xfs_agfl_verify( #endif } -static void +void xfs_agfl_write_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); } -void +static void xfs_agfl_read_verify( struct xfs_buf *bp) { @@ -2181,14 +2181,14 @@ xfs_agf_verify( } } -static void +void xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } -void +static void xfs_agf_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index feacb061bab..f32811f50f4 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +void xfs_agf_write_verify(struct xfs_buf *bp); +void xfs_agfl_write_verify(struct xfs_buf *bp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 6e98b22ebde..b8339652491 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -401,6 +401,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .read_verify = xfs_allocbt_read_verify, + .write_verify = xfs_allocbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 57729d71ab1..5cd5b0c1d17 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -924,7 +924,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); + bp2->b_pre_io = bp1->b_pre_io; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +978,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - ASSERT(bp != NULL); + bp->b_pre_io = xfs_attr_leaf_write_verify; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 9ae7aba52e0..6a0f3f9f39d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3124,6 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ + abp->b_pre_io = xfs_bmbt_write_verify; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3270,6 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp->b_pre_io = xfs_bmbt_write_verify; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 17d7423e750..79758e1e4f7 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,7 +749,7 @@ xfs_bmbt_verify( } } -static void +void xfs_bmbt_write_verify( struct xfs_buf *bp) { @@ -806,6 +806,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .read_verify = xfs_bmbt_read_verify, + .write_verify = xfs_bmbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 1d00fbe9dd7..938c8598654 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -233,6 +233,7 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern void xfs_bmbt_read_verify(struct xfs_buf *bp); +extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ef1066078c3..1e2d89eed2a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -996,6 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; + (*bpp)->b_pre_io = cur->bc_ops->write_verify; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 3a4c314047a..458ab355089 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -189,6 +189,8 @@ struct xfs_btree_ops { union xfs_btree_key *key); void (*read_verify)(struct xfs_buf *bp); + void (*write_verify)(struct xfs_buf *bp); + #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6bb0a59eaae..087950fc2eb 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -193,6 +193,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_pre_io = xfs_da_node_write_verify; *bpp = bp; return(0); } @@ -392,6 +393,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, } memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_pre_io = blk1->bp->b_pre_io; blk1->bp = bp; blk1->blkno = blkno; diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 0f8793c74fe..e2fdc6f03d8 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1010,6 +1010,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ + dbp->b_pre_io = xfs_dir2_block_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1139,6 +1140,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } + bp->b_pre_io = xfs_dir2_block_write_verify; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index b555585f5ab..dcb8a873ab9 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -void +static void xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -202,14 +202,14 @@ xfs_dir2_data_verify( } } -static void +void xfs_dir2_data_write_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); } -void +static void xfs_dir2_data_read_verify( struct xfs_buf *bp) { @@ -482,10 +482,9 @@ xfs_dir2_data_init( */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); - if (error) { + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_pre_io = xfs_dir2_data_write_verify; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 5b3bcab2a65..3002ab7d54c 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -81,7 +81,7 @@ xfs_dir2_leaf1_read_verify( xfs_buf_ioend(bp, 0); } -static void +void xfs_dir2_leafn_write_verify( struct xfs_buf *bp) { @@ -198,6 +198,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ + dbp->b_pre_io = xfs_dir2_data_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1243,15 +1244,14 @@ xfs_dir2_leaf_init( * Get the buffer for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); - leaf = bp->b_addr; + /* * Initialize the header. */ + leaf = bp->b_addr; leaf->hdr.info.magic = cpu_to_be16(magic); leaf->hdr.info.forw = 0; leaf->hdr.info.back = 0; @@ -1264,10 +1264,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { + bp->b_pre_io = xfs_dir2_leaf1_write_verify; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); - } + } else + bp->b_pre_io = xfs_dir2_leafn_write_verify; *bpp = bp; return 0; } @@ -1951,7 +1953,10 @@ xfs_dir2_node_to_leaf( xfs_dir2_leaf_compact(args, lbp); else xfs_dir2_leaf_log_header(tp, lbp); + + lbp->b_pre_io = xfs_dir2_leaf1_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + /* * Set up the leaf tail from the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index a58abe1fc0d..da90a91f442 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -197,11 +197,12 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; + free = fbp->b_addr; leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -223,7 +224,10 @@ xfs_dir2_leaf_to_node( *to = cpu_to_be16(off); } free->hdr.nused = cpu_to_be32(n); + + lbp->b_pre_io = xfs_dir2_leafn_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* * Log everything. */ @@ -632,6 +636,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -646,6 +651,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1638,12 +1644,12 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7ec61af8449..01b82dcddc3 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -45,6 +45,7 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif +extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,6 +74,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, /* xfs_dir2_leaf.c */ extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b38a10e6f2e..1b06aa05107 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -248,7 +248,57 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} /* * Allocate a block and fill it with dquots. @@ -315,6 +365,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; + bp->b_pre_io = xfs_dquot_buf_write_verify; /* * Make a chunk of dquots out of this buffer and log @@ -359,59 +410,6 @@ xfs_qm_dqalloc( return (error); } - -static void -xfs_dquot_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; - xfs_dqid_t id = 0; - int i; - - /* - * On the first read of the buffer, verify that each dquot is valid. - * We don't know what the id of the dquot is supposed to be, just that - * they should be increasing monotonically within the buffer. If the - * first id is corrupt, then it will fail on the second dquot in the - * buffer so corruptions could point to the wrong dquot in this case. - */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; - - ddq = &d[i].dd_diskdq; - - if (i == 0) - id = be32_to_cpu(ddq->d_id); - - error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } - } -} - -static void -xfs_dquot_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); -} - -void -xfs_dquot_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} - STATIC int xfs_qm_dqrepair( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb65b067ed3..5d6d6b9d369 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -222,6 +222,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -259,6 +260,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agfl_write_verify; agfl = XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -279,6 +281,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -450,9 +453,10 @@ xfs_growfs_data_private( bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); - if (bp) + if (bp) { xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - else + bp->b_pre_io = xfs_sb_write_verify; + } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 070f4184557..faf68600d3a 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,6 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ + fbuf->b_pre_io = xfs_inode_buf_write_verify; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1504,14 +1505,14 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -static void +void xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } -void +static void xfs_agi_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 1fd6ea4e9c9..7a169e34e30 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +void xfs_agi_write_verify(struct xfs_buf *bp); + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 15a79f8ca03..7761e1ebeff 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -271,6 +271,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .read_verify = xfs_inobt_read_verify, + .write_verify = xfs_inobt_write_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 910b2da0104..dfcbe73f1db 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,7 +420,7 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -static void +void xfs_inode_buf_write_verify( struct xfs_buf *bp) { @@ -1782,6 +1782,18 @@ xfs_ifree_cluster( if (!bp) return ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_pre_io = xfs_inode_buf_write_verify; + /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a322c19723a..482214d120a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -555,6 +555,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_inode_buf_read_verify(struct xfs_buf *); +void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c85da75e4a4..152a7fc843f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,7 +631,7 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -static void +void xfs_sb_write_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index de9089acc61..29c1b3ac920 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -386,6 +386,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ extern void xfs_sb_read_verify(struct xfs_buf *); +extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -- cgit v1.2.3 From 1813dd64057490e7a0678a885c4fe6d02f78bdc1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:54:40 +1100 Subject: xfs: convert buffer verifiers to an ops structure. To separate the verifiers from iodone functions and associate read and write verifiers at the same time, introduce a buffer verifier operations structure to the xfs_buf. This avoids the need for assigning the write verifier, clearing the iodone function and re-running ioend processing in the read verifier, and gets rid of the nasty "b_pre_io" name for the write verifier function pointer. If we ever need to, it will also be easier to add further content specific callbacks to a buffer with an ops structure in place. We also avoid needing to export verifier functions, instead we can simply export the ops structures for those that are needed outside the function they are defined in. This patch also fixes a directory block readahead verifier issue it exposed. This patch also adds ops callbacks to the inode/alloc btree blocks initialised by growfs. These will need more work before they will work with CRCs. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 4 +++ fs/xfs/xfs_alloc.c | 28 ++++++++++++--------- fs/xfs/xfs_alloc.h | 4 +-- fs/xfs/xfs_alloc_btree.c | 18 ++++++++------ fs/xfs/xfs_alloc_btree.h | 2 ++ fs/xfs/xfs_attr_leaf.c | 19 +++++++------- fs/xfs/xfs_attr_leaf.h | 3 ++- fs/xfs/xfs_bmap.c | 22 ++++++++--------- fs/xfs/xfs_bmap_btree.c | 20 ++++++++------- fs/xfs/xfs_bmap_btree.h | 3 +-- fs/xfs/xfs_btree.c | 26 +++++++++---------- fs/xfs/xfs_btree.h | 9 +++---- fs/xfs/xfs_buf.c | 63 ++++++++++++++++++++++++++++------------------- fs/xfs/xfs_buf.h | 24 ++++++++++-------- fs/xfs/xfs_da_btree.c | 40 +++++++++++++++++++----------- fs/xfs/xfs_da_btree.h | 4 +-- fs/xfs/xfs_dir2_block.c | 20 ++++++++------- fs/xfs/xfs_dir2_data.c | 52 +++++++++++++++++++++++++++++++------- fs/xfs/xfs_dir2_leaf.c | 36 +++++++++++++++------------ fs/xfs/xfs_dir2_node.c | 26 ++++++++++--------- fs/xfs/xfs_dir2_priv.h | 10 +++++--- fs/xfs/xfs_dquot.c | 18 ++++++++------ fs/xfs/xfs_dquot.h | 3 ++- fs/xfs/xfs_fsops.c | 29 +++++++++++++--------- fs/xfs/xfs_ialloc.c | 18 ++++++++------ fs/xfs/xfs_ialloc.h | 2 +- fs/xfs/xfs_ialloc_btree.c | 17 +++++++------ fs/xfs/xfs_ialloc_btree.h | 2 ++ fs/xfs/xfs_inode.c | 22 ++++++++++------- fs/xfs/xfs_inode.h | 3 +-- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 35 +++++++++++++++----------- fs/xfs/xfs_mount.h | 4 +-- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_trans.h | 6 ++--- fs/xfs/xfs_trans_buf.c | 8 +++--- 37 files changed, 357 insertions(+), 249 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 22bd4db011c..f2aeedb6a57 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 545a6c4c236..393055fe3ae 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,7 +465,7 @@ xfs_agfl_verify( #endif } -void +static void xfs_agfl_write_verify( struct xfs_buf *bp) { @@ -477,11 +477,13 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); - bp->b_pre_io = xfs_agfl_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. */ @@ -499,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2181,23 +2183,25 @@ xfs_agf_verify( } } -void -xfs_agf_write_verify( +static void +xfs_agf_read_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } static void -xfs_agf_read_verify( +xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); - bp->b_pre_io = xfs_agf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2215,7 +2219,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index f32811f50f4..99d0a610155 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -void xfs_agf_write_verify(struct xfs_buf *bp); -void xfs_agfl_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b8339652491..b1ddef6b268 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -329,22 +329,25 @@ xfs_allocbt_verify( } static void -xfs_allocbt_write_verify( +xfs_allocbt_read_verify( struct xfs_buf *bp) { xfs_allocbt_verify(bp); } -void -xfs_allocbt_read_verify( +static void +xfs_allocbt_write_verify( struct xfs_buf *bp) { xfs_allocbt_verify(bp); - bp->b_pre_io = xfs_allocbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -400,8 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, - .read_verify = xfs_allocbt_read_verify, - .write_verify = xfs_allocbt_write_verify, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed87..7e89a2b429d 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 5cd5b0c1d17..ee24993c7d1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -104,22 +104,23 @@ xfs_attr_leaf_verify( } static void -xfs_attr_leaf_write_verify( +xfs_attr_leaf_read_verify( struct xfs_buf *bp) { xfs_attr_leaf_verify(bp); } -void -xfs_attr_leaf_read_verify( +static void +xfs_attr_leaf_write_verify( struct xfs_buf *bp) { xfs_attr_leaf_verify(bp); - bp->b_pre_io = xfs_attr_leaf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { + .verify_read = xfs_attr_leaf_read_verify, + .verify_write = xfs_attr_leaf_write_verify, +}; int xfs_attr_leaf_read( @@ -130,7 +131,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_read_verify); + XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); } /*======================================================================== @@ -924,7 +925,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - bp2->b_pre_io = bp1->b_pre_io; + bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +979,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - bp->b_pre_io = xfs_attr_leaf_write_verify; + bp->b_ops = &xfs_attr_leaf_buf_ops; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 3bbf6277e43..77de139a58f 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,7 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_read_verify(struct xfs_buf *bp); + +extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6a0f3f9f39d..0e92d12765d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2663,7 +2663,7 @@ xfs_bmap_btree_to_extents( return error; #endif error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); @@ -3124,7 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ - abp->b_pre_io = xfs_bmbt_write_verify; + abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3271,7 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_pre_io = xfs_bmbt_write_verify; + bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); @@ -4082,7 +4082,7 @@ xfs_bmap_read_extents( */ while (level-- > 0) { error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -4129,7 +4129,7 @@ xfs_bmap_read_extents( nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) xfs_btree_reada_bufl(mp, nextbno, 1, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); /* * Copy records into the extent records. */ @@ -4162,7 +4162,7 @@ xfs_bmap_read_extents( if (bno == NULLFSBLOCK) break; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -5880,7 +5880,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -5966,7 +5966,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -6061,7 +6061,7 @@ xfs_bmap_count_tree( int numrecs; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6073,7 +6073,7 @@ xfs_bmap_count_tree( while (nextbno != NULLFSBLOCK) { error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6105,7 +6105,7 @@ xfs_bmap_count_tree( bno = nextbno; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 79758e1e4f7..061b45cbe61 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,23 +749,26 @@ xfs_bmbt_verify( } } -void -xfs_bmbt_write_verify( +static void +xfs_bmbt_read_verify( struct xfs_buf *bp) { xfs_bmbt_verify(bp); } -void -xfs_bmbt_read_verify( +static void +xfs_bmbt_write_verify( struct xfs_buf *bp) { xfs_bmbt_verify(bp); - bp->b_pre_io = xfs_bmbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -805,8 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, - .read_verify = xfs_bmbt_read_verify, - .write_verify = xfs_bmbt_write_verify, + .buf_ops = &xfs_bmbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 938c8598654..88469ca0869 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,11 +232,10 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); -extern void xfs_bmbt_read_verify(struct xfs_buf *bp); -extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1e2d89eed2a..db010408d70 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -271,7 +271,7 @@ xfs_btree_dup_cursor( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -621,7 +621,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ @@ -630,7 +630,7 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, verify); + mp->m_bsize, lock, &bp, ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -650,13 +650,13 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -670,14 +670,14 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } STATIC int @@ -692,13 +692,13 @@ xfs_btree_readahead_lblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } @@ -718,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1, cur->bc_ops->read_verify); + left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1, cur->bc_ops->read_verify); + right, 1, cur->bc_ops->buf_ops); rval++; } @@ -996,7 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; - (*bpp)->b_pre_io = cur->bc_ops->write_verify; + (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } @@ -1024,7 +1024,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags, bpp, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 458ab355089..f932897194e 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,8 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); - void (*read_verify)(struct xfs_buf *bp); - void (*write_verify)(struct xfs_buf *bp); + const struct xfs_buf_ops *buf_ops; #ifdef DEBUG /* check that k1 is lower than k2 */ @@ -359,7 +358,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -370,7 +369,7 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -382,7 +381,7 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bd1a948ee39..26673a0b20e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -571,7 +571,7 @@ found: ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; - bp->b_pre_io = NULL; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -657,7 +657,7 @@ xfs_buf_read_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -669,7 +669,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); - bp->b_iodone = verify; + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -696,13 +696,13 @@ xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -715,7 +715,7 @@ xfs_buf_read_uncached( xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -728,7 +728,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); xfs_buf_iowait(bp); @@ -1001,27 +1001,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -1029,6 +1039,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1316,6 +1327,20 @@ _xfs_buf_ioapply( rw |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; } else { @@ -1325,20 +1350,6 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; - /* - * run the pre-io callback function if it exists. If this function - * fails it will mark the buffer with an error and the IO should - * not be dispatched. - */ - if (bp->b_pre_io) { - bp->b_pre_io(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } - /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 51bc16a1cd9..23f5642480b 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -111,6 +111,11 @@ struct xfs_buf_map { #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -154,9 +159,7 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ - - void (*b_pre_io)(struct xfs_buf *); - /* pre-io callback function */ + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; @@ -199,10 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, xfs_buf_iodone_t verify); + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline struct xfs_buf * xfs_buf_get( @@ -221,10 +225,10 @@ xfs_buf_read( xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags, verify); + return xfs_buf_read_map(target, &map, 1, flags, ops); } static inline void @@ -232,10 +236,10 @@ xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1, verify); + return xfs_buf_readahead_map(target, &map, 1, ops); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -246,7 +250,7 @@ struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 087950fc2eb..4d7696a0241 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -117,6 +117,12 @@ xfs_da_node_write_verify( xfs_da_node_verify(bp); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void xfs_da_node_read_verify( struct xfs_buf *bp) @@ -129,10 +135,12 @@ xfs_da_node_read_verify( xfs_da_node_verify(bp); break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_read_verify(bp); + bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_read_verify(bp); + bp->b_ops = &xfs_dir2_leafn_buf_ops; + bp->b_ops->verify_read(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, @@ -140,12 +148,14 @@ xfs_da_node_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - - bp->b_pre_io = xfs_da_node_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_da_node_buf_ops = { + .verify_read = xfs_da_node_read_verify, + .verify_write = xfs_da_node_write_verify, +}; + + int xfs_da_node_read( struct xfs_trans *tp, @@ -156,7 +166,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_read_verify); + which_fork, &xfs_da_node_buf_ops); } /*======================================================================== @@ -193,7 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); - bp->b_pre_io = xfs_da_node_write_verify; + bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -394,7 +404,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_pre_io = blk1->bp->b_pre_io; + bp->b_ops = blk1->bp->b_ops; blk1->bp = bp; blk1->blkno = blkno; @@ -828,11 +838,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have - * to update the pre_io pointer as well to match the buffer type change + * to update the b_ops pointer as well to match the buffer type change * that could occur. */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); - root_blk->bp->b_pre_io = bp->b_pre_io; + root_blk->bp->b_ops = bp->b_ops; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -2223,7 +2233,7 @@ xfs_da_read_buf( xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2245,7 +2255,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, verifier); + mapp, nmap, 0, &bp, ops); if (error) goto out_free; @@ -2303,7 +2313,7 @@ xfs_da_reada_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf_map map; struct xfs_buf_map *mapp; @@ -2322,7 +2332,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 521b008445a..ee5170c46ae 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -229,10 +229,10 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier); + const struct xfs_buf_ops *ops); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, xfs_buf_iodone_t verifier); + int whichfork, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e2fdc6f03d8..7536faaa61e 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -74,22 +74,24 @@ xfs_dir2_block_verify( } static void -xfs_dir2_block_write_verify( +xfs_dir2_block_read_verify( struct xfs_buf *bp) { xfs_dir2_block_verify(bp); } -void -xfs_dir2_block_read_verify( +static void +xfs_dir2_block_write_verify( struct xfs_buf *bp) { xfs_dir2_block_verify(bp); - bp->b_pre_io = xfs_dir2_block_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dir2_block_buf_ops = { + .verify_read = xfs_dir2_block_read_verify, + .verify_write = xfs_dir2_block_write_verify, +}; + static int xfs_dir2_block_read( struct xfs_trans *tp, @@ -99,7 +101,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_read_verify); + XFS_DATA_FORK, &xfs_dir2_block_buf_ops); } static void @@ -1010,7 +1012,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - dbp->b_pre_io = xfs_dir2_block_write_verify; + dbp->b_ops = &xfs_dir2_block_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1140,7 +1142,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - bp->b_pre_io = xfs_dir2_block_write_verify; + bp->b_ops = &xfs_dir2_block_buf_ops; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index dcb8a873ab9..ffcf1774152 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -202,23 +202,57 @@ xfs_dir2_data_verify( } } -void -xfs_dir2_data_write_verify( +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir2_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + bp->b_ops = &xfs_dir2_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + xfs_dir2_data_verify(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } +} + +static void +xfs_dir2_data_read_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); } static void -xfs_dir2_data_read_verify( +xfs_dir2_data_write_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); - bp->b_pre_io = xfs_dir2_data_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dir2_data_buf_ops = { + .verify_read = xfs_dir2_data_read_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { + .verify_read = xfs_dir2_data_reada_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + int xfs_dir2_data_read( @@ -229,7 +263,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_buf_ops); } int @@ -240,7 +274,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); } /* @@ -484,7 +518,7 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_pre_io = xfs_dir2_data_write_verify; + bp->b_ops = &xfs_dir2_data_buf_ops; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 3002ab7d54c..60cd2fa4e04 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -65,39 +65,43 @@ xfs_dir2_leaf_verify( } static void -xfs_dir2_leaf1_write_verify( +xfs_dir2_leaf1_read_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir2_leaf1_write_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - bp->b_pre_io = xfs_dir2_leaf1_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } void -xfs_dir2_leafn_write_verify( +xfs_dir2_leafn_read_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void -xfs_dir2_leafn_read_verify( +xfs_dir2_leafn_write_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - bp->b_pre_io = xfs_dir2_leafn_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { + .verify_read = xfs_dir2_leaf1_read_verify, + .verify_write = xfs_dir2_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { + .verify_read = xfs_dir2_leafn_read_verify, + .verify_write = xfs_dir2_leafn_write_verify, +}; + static int xfs_dir2_leaf_read( struct xfs_trans *tp, @@ -107,7 +111,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); + XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); } int @@ -119,7 +123,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_read_verify); + XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); } /* @@ -198,7 +202,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ - dbp->b_pre_io = xfs_dir2_data_write_verify; + dbp->b_ops = &xfs_dir2_data_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1264,12 +1268,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_pre_io = xfs_dir2_leaf1_write_verify; + bp->b_ops = &xfs_dir2_leaf1_buf_ops; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); } else - bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_ops = &xfs_dir2_leafn_buf_ops; *bpp = bp; return 0; } @@ -1954,7 +1958,7 @@ xfs_dir2_node_to_leaf( else xfs_dir2_leaf_log_header(tp, lbp); - lbp->b_pre_io = xfs_dir2_leaf1_write_verify; + lbp->b_ops = &xfs_dir2_leaf1_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index da90a91f442..5980f9b7fa9 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -72,22 +72,24 @@ xfs_dir2_free_verify( } static void -xfs_dir2_free_write_verify( +xfs_dir2_free_read_verify( struct xfs_buf *bp) { xfs_dir2_free_verify(bp); } -void -xfs_dir2_free_read_verify( +static void +xfs_dir2_free_write_verify( struct xfs_buf *bp) { xfs_dir2_free_verify(bp); - bp->b_pre_io = xfs_dir2_free_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { + .verify_read = xfs_dir2_free_read_verify, + .verify_write = xfs_dir2_free_write_verify, +}; + static int __xfs_dir2_free_read( @@ -98,7 +100,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_read_verify); + XFS_DATA_FORK, &xfs_dir2_free_buf_ops); } int @@ -201,7 +203,7 @@ xfs_dir2_leaf_to_node( XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; leaf = lbp->b_addr; @@ -225,7 +227,7 @@ xfs_dir2_leaf_to_node( } free->hdr.nused = cpu_to_be32(n); - lbp->b_pre_io = xfs_dir2_leafn_write_verify; + lbp->b_ops = &xfs_dir2_leafn_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* @@ -636,7 +638,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -651,7 +653,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1649,7 +1651,7 @@ xfs_dir2_node_addname_int( -1, &fbp, XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 01b82dcddc3..7da79f6515f 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ +extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; + extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, xfs_off_t *offset, filldir_t filldir); @@ -45,7 +47,9 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif -extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); + +extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; + extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,8 +77,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); -extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; + extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1b06aa05107..9e1bf5294c9 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -284,22 +284,24 @@ xfs_dquot_buf_verify( } static void -xfs_dquot_buf_write_verify( +xfs_dquot_buf_read_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); } void -xfs_dquot_buf_read_verify( +xfs_dquot_buf_write_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + /* * Allocate a block and fill it with dquots. * This is called when the bmapi finds a hole. @@ -365,7 +367,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; - bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log @@ -435,7 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } - (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; + (*bpp)->b_ops = &xfs_dquot_buf_ops; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -534,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_buf_read_verify); + 0, &bp, &xfs_dquot_buf_ops); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 5438d883b62..c694a8469c4 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -162,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5d6d6b9d369..94eaeedc549 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -119,7 +119,8 @@ xfs_growfs_get_hdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags) + int flags, + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -130,6 +131,7 @@ xfs_growfs_get_hdr_buf( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; return bp; } @@ -217,12 +219,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -255,12 +257,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agfl_write_verify; agfl = XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -276,12 +278,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -306,7 +308,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; @@ -329,7 +332,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -352,7 +356,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -448,14 +453,14 @@ xfs_growfs_data_private( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, - xfs_sb_read_verify); + &xfs_sb_buf_ops); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); if (bp) { + bp->b_ops = &xfs_sb_buf_ops; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_pre_io = xfs_sb_write_verify; } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index faf68600d3a..2d6495eaaa3 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,7 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ - fbuf->b_pre_io = xfs_inode_buf_write_verify; + fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1505,23 +1505,25 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -void -xfs_agi_write_verify( +static void +xfs_agi_read_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } static void -xfs_agi_read_verify( +xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); - bp->b_pre_io = xfs_agi_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ @@ -1538,7 +1540,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7a169e34e30..c8da3df271e 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,6 +150,6 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); -void xfs_agi_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 7761e1ebeff..bec344b3650 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -217,22 +217,24 @@ xfs_inobt_verify( } static void -xfs_inobt_write_verify( +xfs_inobt_read_verify( struct xfs_buf *bp) { xfs_inobt_verify(bp); } -void -xfs_inobt_read_verify( +static void +xfs_inobt_write_verify( struct xfs_buf *bp) { xfs_inobt_verify(bp); - bp->b_pre_io = xfs_inobt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -270,8 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .read_verify = xfs_inobt_read_verify, - .write_verify = xfs_inobt_write_verify, + .buf_ops = &xfs_inobt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c476..25c0239a8ea 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_inobt_buf_ops; + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dfcbe73f1db..66282dcb821 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,23 +420,27 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -void -xfs_inode_buf_write_verify( + +static void +xfs_inode_buf_read_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp); } -void -xfs_inode_buf_read_verify( +static void +xfs_inode_buf_write_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp); - bp->b_pre_io = xfs_inode_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -462,7 +466,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); @@ -1792,7 +1796,7 @@ xfs_ifree_cluster( * want it to fail. We can acheive this by adding a write * verifier to the buffer. */ - bp->b_pre_io = xfs_inode_buf_write_verify; + bp->b_ops = &xfs_inode_buf_ops; /* * Walk the inodes already attached to the buffer and mark them diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 482214d120a..22baf6ea4fa 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,8 +554,6 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_read_verify(struct xfs_buf *); -void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -600,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7f86fdaab7a..2ea7d402188 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 924a4bc3d49..931e8e23f19 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3699,7 +3699,7 @@ xlog_do_recover( ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); - bp->b_iodone = xfs_sb_read_verify; + bp->b_ops = &xfs_sb_buf_ops; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 152a7fc843f..da508463ff1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,21 +631,11 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -void -xfs_sb_write_verify( - struct xfs_buf *bp) -{ - xfs_sb_verify(bp); -} - -void +static void xfs_sb_read_verify( struct xfs_buf *bp) { xfs_sb_verify(bp); - bp->b_pre_io = xfs_sb_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } /* @@ -654,7 +644,7 @@ xfs_sb_read_verify( * If we find an XFS superblock, the run a normal, noisy mount because we are * really going to mount it and want to know about errors. */ -void +static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { @@ -671,6 +661,23 @@ xfs_sb_quiet_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + /* * xfs_readsb * @@ -697,8 +704,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 0, - loud ? xfs_sb_read_verify - : xfs_sb_quiet_read_verify); + loud ? &xfs_sb_buf_ops + : &xfs_sb_quiet_buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 29c1b3ac920..bab8314507e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,12 +385,12 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ -extern void xfs_sb_read_verify(struct xfs_buf *); -extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern const struct xfs_buf_ops xfs_sb_buf_ops; + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index bd40ae9624e..e6a0af0ba00 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_buf_read_verify); + &xfs_dquot_buf_ops); if (error) break; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f02d4029650..c6c0601abd7 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -474,7 +474,7 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline int xfs_trans_read_buf( @@ -485,11 +485,11 @@ xfs_trans_read_buf( int numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_read_buf_map(mp, tp, target, &map, 1, - flags, bpp, verify); + flags, bpp, ops); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 977628207b4..4fc17d479d4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -258,7 +258,7 @@ xfs_trans_read_buf_map( int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -266,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -315,7 +315,7 @@ xfs_trans_read_buf_map( ASSERT(!XFS_BUF_ISASYNC(bp)); ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -352,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? -- cgit v1.2.3 From bc02e8693d875c2a9b0037cfd37fe0b726d26403 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2012 09:20:37 +1100 Subject: xfs: add CRC infrastructure - add a mount feature bit for CRC enabled filesystems - add some helpers for generating and verifying the CRCs - add a copy_uuid helper The checksumming helpers are loosely based on similar ones in sctp, all other bits come from Dave Chinner. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Kconfig | 1 + fs/xfs/uuid.h | 6 ++++++ fs/xfs/xfs_cksum.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_sb.h | 7 ++++++ 5 files changed, 78 insertions(+) create mode 100644 fs/xfs/xfs_cksum.h (limited to 'fs/xfs') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d..5a7ffe54f5d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262c..104db0f3bed 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 00000000000..fad1676ad8c --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 0a134ca5211..fe7e4df85a7 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d32..a05b45175fb 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -81,6 +81,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); } +static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); +} + /* * end of superblock version macros */ -- cgit v1.2.3 From 0e446be44806240c779666591bb9e8cb0e86a50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Nov 2012 22:54:24 +1100 Subject: xfs: add CRC checks to the log Implement CRCs for the log buffers. We re-use a field in struct xlog_rec_header that was used for a weak checksum of the log buffer payload in debug builds before. The new checksumming uses the crc32c checksum we will use elsewhere in XFS, and also protects the record header and addition cycle data. Due to this there are some interesting changes in xlog_sync, as we need to do the cycle wrapping for the split buffer case much earlier, as we would touch the buffer after generating the checksum otherwise. The CRC calculation is always enabled, even for non-CRC filesystems, as adding this CRC does not change the log format. On non-CRC filesystems, only issue an alert if a CRC mismatch is found and allow recovery to continue - this will act as an indicator that log recovery problems are a result of log corruption. On CRC enabled filesystems, however, log recovery will fail. Note that existing debug kernels will write a simple checksum value to the log, so the first time this is run on a filesystem taht was last used on a debug kernel it will through CRC mismatch warning errors. These can be ignored. Initially based on a patch from Dave Chinner, then modified significantly by Christoph Hellwig. Modified again by Dave Chinner to get to this version. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 132 ++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log_priv.h | 11 ++-- fs/xfs/xfs_log_recover.c | 132 ++++++++++++++++++++++------------------------- 3 files changed, 176 insertions(+), 99 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1d6d2ee0849..c6d6e136ba7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -35,6 +35,7 @@ #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -1489,6 +1490,84 @@ xlog_grant_push_ail( xfs_ail_push(log->l_ailp, threshold_lsn); } +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__be32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + /* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to @@ -1549,7 +1628,6 @@ xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1558,6 +1636,7 @@ xlog_sync( int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1588,13 +1667,10 @@ xlog_sync( xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1603,12 +1679,36 @@ xlog_sync( /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1662,19 +1762,6 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. - */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1691,7 +1778,6 @@ xlog_sync( return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9a4e0e5ec32..dc3498bf17c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -555,11 +554,9 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int); + +extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 931e8e23f19..9c3651c9e75 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,6 +41,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -3216,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - struct xlog *log, - struct xlog_in_core *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __be32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + be32_to_cpu(rhead->h_crc), + be32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3306,6 +3285,8 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int @@ -3437,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3549,9 +3534,14 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3576,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } -- cgit v1.2.3 From 7c4cebe8e02dd0b0e655605442bbe9268db9ed4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 23 Nov 2012 14:24:23 +1100 Subject: xfs: inode allocation should use unmapped buffers. Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 2d6495eaaa3..a815412eab8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,7 +200,8 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!fbuf) return ENOMEM; /* -- cgit v1.2.3 From ef9d873344ff9f5084eacb9f3735982314dfda9e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 29 Nov 2012 15:26:33 +1100 Subject: xfs: byte range granularity for XFS_IOC_ZERO_RANGE XFS_IOC_ZERO_RANGE simply does not work properly for non page cache aligned ranges. Neither test 242 or 290 exercise this correctly, so the behaviour is completely busted even though the tests pass. Fix it to support full byte range granularity as was originally intended for this ioctl. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_vnodeops.c | 96 ++++++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_vnodeops.h | 1 + 3 files changed, 77 insertions(+), 22 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 400b187595b..67284edb84d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -86,7 +86,7 @@ xfs_rw_ilock_demote( * valid before the operation, it will be read from disk before * being partially zeroed. */ -STATIC int +int xfs_iozero( struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 26880793fec..d95f565a390 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2095,6 +2095,73 @@ xfs_free_file_space( return error; } + +STATIC int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (start_boundary < end_boundary - 1) { + /* punch out the page cache over the conversion range */ + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, + attr_flags); + if (error) + goto out_unlock; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out_unlock; + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out_unlock: + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + +} + /* * xfs_change_file_space() * This routine allocates or frees disk space for the given file. @@ -2120,10 +2187,8 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; - int prealloc_type; if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2172,31 +2237,20 @@ xfs_change_file_space( startoffset = bf->l_start; fsize = XFS_ISIZE(ip); - /* - * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve - * file space. - * These calls do NOT zero the data space allocated to the file, - * nor do they change the file size. - * - * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file - * space. - * These calls cause the new file data to be zeroed and the file - * size to be changed. - */ setprealloc = clrprealloc = 0; - prealloc_type = XFS_BMAPI_PREALLOC; - switch (cmd) { case XFS_IOC_ZERO_RANGE: - prealloc_type |= XFS_BMAPI_CONVERT; - end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset <= end) - truncate_pagecache_range(VFS_I(ip), startoffset, end); - /* FALLTHRU */ + error = xfs_zero_file_space(ip, startoffset, bf->l_len, + attr_flags); + if (error) + return error; + setprealloc = 1; + break; + case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - prealloc_type, attr_flags); + XFS_BMAPI_PREALLOC, attr_flags); if (error) return error; setprealloc = 1; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 91a03fa3814..5163022d980 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -49,6 +49,7 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); -- cgit v1.2.3 From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:00 +1100 Subject: xfs: fix direct IO nested transaction deadlock. The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and small log, the append transaction can be granted after wating for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_writ xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k whih increased the base reservations by 512k. Hence a allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was a 1000 fsstress processes running with random freeze and unfreezes every few seconds. Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration.... cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 81 ++++++++++++++++++++----------------------------------- fs/xfs/xfs_log.c | 3 ++- 2 files changed, 31 insertions(+), 53 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71361da1f77..4111a40ebe1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. */ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1460,15 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - rwsem_acquire_read( - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c6d6e136ba7..c49e2c12dba 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -460,7 +460,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); -- cgit v1.2.3 From b870553cdecb26d5291af09602352b763e323df2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:02 +1100 Subject: xfs: fix stray dquot unlock when reclaiming dquots When we fail to get a dquot lock during reclaim, we jump to an error handler that unlocks the dquot. This is wrong as we didn't lock the dquot, and unlocking it means who-ever is holding the lock has had it silently taken away, and hence it results in a lock imbalance. Found by inspection while modifying the code for the numa-lru patchset. This fixes a random hang I've been seeing on xfstest 232 for the past several months. cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e6a0af0ba00..60eff476315 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1456,7 +1456,7 @@ xfs_qm_dqreclaim_one( int error; if (!xfs_dqlock_nowait(dqp)) - goto out_busy; + goto out_move_tail; /* * This dquot has acquired a reference in the meantime remove it from @@ -1479,7 +1479,7 @@ xfs_qm_dqreclaim_one( * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) - goto out_busy; + goto out_unlock_move_tail; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -1490,7 +1490,7 @@ xfs_qm_dqreclaim_one( if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); - goto out_busy; + goto out_unlock_move_tail; } xfs_buf_delwri_queue(bp, buffer_list); @@ -1499,7 +1499,7 @@ xfs_qm_dqreclaim_one( * Give the dquot another try on the freelist, as the * flushing will take some time. */ - goto out_busy; + goto out_unlock_move_tail; } xfs_dqfunlock(dqp); @@ -1518,14 +1518,13 @@ xfs_qm_dqreclaim_one( XFS_STATS_INC(xs_qm_dqreclaims); return; -out_busy: - xfs_dqunlock(dqp); - /* * Move the dquot to the tail of the list so that we don't spin on it. */ +out_unlock_move_tail: + xfs_dqunlock(dqp); +out_move_tail: list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(xs_qm_dqreclaim_misses); } -- cgit v1.2.3 From f9668a09e32ac6d2aa22f44cc310e430a8f4a40f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:03 +1100 Subject: xfs: fix sparse reported log CRC endian issue Not a bug as such, just warning noise from the xlog_cksum() returning a __be32 type when it should be returning a __le32 type. On Wed, Nov 28, 2012 at 08:30:59AM -0500, Christoph Hellwig wrote: > But why are we storing the crc field little endian while all other on > disk formats are big endian? (And yes I realize it might as well have > been me who did that back in the idea, but I still have no idea why) Because the CRC always returns the calcuation LE format, even on BE systems. So rather than always having to byte swap it everywhere and have all the force casts and anootations for sparse, it seems simpler to just make it a __le32 everywhere.... Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_priv.h | 2 +- fs/xfs/xfs_log_recover.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c49e2c12dba..46bd9d52ab5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1538,7 +1538,7 @@ xlog_pack_data( * This is a little more complicated than it should be because the various * headers and the actual data are non-contiguous. */ -__be32 +__le32 xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index dc3498bf17c..16d8d12ea3b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -555,7 +555,7 @@ extern int xlog_recover_finish( struct xlog *log); -extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9c3651c9e75..96fcbb85ff8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3233,15 +3233,15 @@ xlog_unpack_data_crc( xfs_caddr_t dp, struct xlog *log) { - __be32 crc; + __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.\n", - be32_to_cpu(rhead->h_crc), - be32_to_cpu(crc)); + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } -- cgit v1.2.3 From d44d9bc68e32ad5881b105f82bd259d261d1ef74 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:02 -0600 Subject: xfs: use b_maps[] for discontiguous buffers Commits starting at 77c1a08 introduced a multiple segment support to xfs_buf. xfs_trans_buf_item_match() could not find a multi-segment buffer in the transaction because it was looking at the single segment block number rather than the multi-segment b_maps[0].bm.bn. This results on a recursive buffer lock that can never be satisfied. This patch: 1) Changed the remaining b_map accesses to be b_maps[0] accesses. 2) Renames the single segment b_map structure to __b_map to avoid future confusion. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 12 ++++++------ fs/xfs/xfs_buf.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 26673a0b20e..56d1614760c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -175,7 +175,7 @@ xfs_buf_get_maps( bp->b_map_count = map_count; if (map_count == 1) { - bp->b_maps = &bp->b_map; + bp->b_maps = &bp->__b_map; return 0; } @@ -193,7 +193,7 @@ static void xfs_buf_free_maps( struct xfs_buf *bp) { - if (bp->b_maps != &bp->b_map) { + if (bp->b_maps != &bp->__b_map) { kmem_free(bp->b_maps); bp->b_maps = NULL; } @@ -377,8 +377,8 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); @@ -640,7 +640,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -1709,7 +1709,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_map.bm_bn - bp->b_map.bm_bn; + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 23f5642480b..433a12ed7b1 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf { struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ - struct xfs_buf_map b_map; /* inline compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ @@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp); * In future, uncached buffers will pass the block number directly to the io * request function and hence these macros will go away at that point. */ -#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { -- cgit v1.2.3 From 0f22f9d0cd8a630b40a9ccc07a8844345b185aae Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:03 -0600 Subject: xfs: rename bli_format to avoid confusion with bli_formats Rename the bli_format structure to __bli_format to avoid accidently confusing them with the bli_formats pointer. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 22 +++++++++++----------- fs/xfs/xfs_buf_item.h | 2 +- fs/xfs/xfs_trans_buf.c | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index becf4a97efc..1975b3d9007 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -71,7 +71,7 @@ xfs_buf_item_log_debug( chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &(bip->__bli_format.blf_data_map[word_num]); bit_set = *wordp & (1 << bit_num); ASSERT(bit_set); byte++; @@ -237,7 +237,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); return bip->bli_format_count; } @@ -278,7 +278,7 @@ xfs_buf_item_format_segment( uint buffer_offset; /* copy the flags across from the base format item */ - blfp->blf_flags = bip->bli_format.blf_flags; + blfp->blf_flags = bip->__bli_format.blf_flags; /* * Base size is the actual size of the ondisk structure - it reflects @@ -371,7 +371,7 @@ xfs_buf_item_format_segment( nbits++; } } - bip->bli_format.blf_size = nvecs; + bip->__bli_format.blf_size = nvecs; return vecp; } @@ -405,7 +405,7 @@ xfs_buf_item_format( if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } @@ -485,7 +485,7 @@ xfs_buf_item_unpin( ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -631,7 +631,7 @@ xfs_buf_item_unlock( */ if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; @@ -644,8 +644,8 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. */ - if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) + if (xfs_bitmap_empty(bip->__bli_format.blf_data_map, + bip->__bli_format.blf_map_size)) xfs_buf_item_relse(bp); else atomic_dec(&bip->bli_refcount); @@ -716,7 +716,7 @@ xfs_buf_item_get_format( bip->bli_format_count = count; if (count == 1) { - bip->bli_formats = &bip->bli_format; + bip->bli_formats = &bip->__bli_format; return 0; } @@ -731,7 +731,7 @@ STATIC void xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { - if (bip->bli_formats != &bip->bli_format) { + if (bip->bli_formats != &bip->__bli_format) { kmem_free(bip->bli_formats); bip->bli_formats = NULL; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 6850f49f4af..16def435944 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -104,7 +104,7 @@ typedef struct xfs_buf_log_item { #endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ - struct xfs_buf_log_format bli_format; /* embedded in-log header */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4fc17d479d4..f7510bf6828 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -93,7 +93,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_fspriv; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = bp->b_fspriv; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_brelse(bip); @@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; @@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); @@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } tp->t_flags |= XFS_TRANS_DIRTY; @@ -657,8 +657,8 @@ xfs_trans_binval( */ ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -668,10 +668,10 @@ xfs_trans_binval( bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLF_CANCEL; - memset((char *)(bip->bli_format.blf_data_map), 0, - (bip->bli_format.blf_map_size * sizeof(uint))); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + memset((char *)(bip->__bli_format.blf_data_map), 0, + (bip->__bli_format.blf_map_size * sizeof(uint))); bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -775,5 +775,5 @@ xfs_trans_dquot_buf( type == XFS_BLF_GDQUOT_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= type; + bip->__bli_format.blf_flags |= type; } -- cgit v1.2.3 From 2d0e9df579029b62adc72b50977182757cc04cd5 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:04 -0600 Subject: xfs: fix segment in xfs_buf_item_format_segment Not every segment in a multi-segment buffer is dirty in a transaction and they will not be outputted. The assert in xfs_buf_item_format_segment() that checks for the at least one chunk of data in the segment to be used is not necessary true for multi-segmented buffers. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1975b3d9007..c48e60bd857 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -287,6 +287,17 @@ xfs_buf_item_format_segment( */ base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + + nvecs = 0; + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + goto out; + } + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; @@ -301,15 +312,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - blfp->blf_size = nvecs; - return vecp; + goto out; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); - ASSERT(first_bit != -1); + last_bit = first_bit; nbits = 1; for (;;) { @@ -371,7 +380,8 @@ xfs_buf_item_format_segment( nbits++; } } - bip->__bli_format.blf_size = nvecs; +out: + blfp->blf_size = nvecs; return vecp; } -- cgit v1.2.3 From 91e4bac0b72736410c88632906953f14259144b1 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Tue, 4 Dec 2012 17:18:05 -0600 Subject: xfs: fix the multi-segment log buffer format Per Dave Chinner suggestion, this patch: 1) Corrects the detection of whether a multi-segment buffer is still tracking data. 2) Clears all the buffer log formats for a multi-segment buffer. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 13 ++++++++++--- fs/xfs/xfs_trans_buf.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index c48e60bd857..77b09750e92 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -611,7 +611,7 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted; + int aborted, clean, i; uint hold; /* Clear the buffer's association with this transaction. */ @@ -654,8 +654,15 @@ xfs_buf_item_unlock( * If the buf item isn't tracking any data, free it, otherwise drop the * reference we hold to it. */ - if (xfs_bitmap_empty(bip->__bli_format.blf_data_map, - bip->__bli_format.blf_map_size)) + clean = 1; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = 0; + break; + } + } + if (clean) xfs_buf_item_relse(bp); else atomic_dec(&bip->bli_refcount); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index f7510bf6828..3edf5dbee00 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -643,6 +643,7 @@ xfs_trans_binval( xfs_buf_t *bp) { xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -670,8 +671,10 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; - memset((char *)(bip->__bli_format.blf_data_map), 0, - (bip->__bli_format.blf_map_size * sizeof(uint))); + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } -- cgit v1.2.3 From ab7eac22008f044631c0a3f4be344ebc2cb0e266 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 21 Dec 2012 10:45:17 -0500 Subject: xfs: remove int casts from debug dquot soft limit timer asserts The int casts here make it easy to trigger an assert with a large soft limit. For example, set a >4TB soft limit on an empty volume to reproduce a (0 > -x) comparison due to an overflow of d_blk_softlimit. Signed-off-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_qm_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5f53e75409b..8a59f854655 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -784,11 +784,11 @@ xfs_qm_scall_getquota( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && + if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && + if ((dst->d_icount > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } -- cgit v1.2.3 From 37f13561de6039b3a916d1510086030d097dea0f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 10 Jan 2013 10:41:48 -0600 Subject: xfs: recalculate leaf entry pointer after compacting a dir2 block Dave Jones hit this assert when doing a compile on recent git, with CONFIG_XFS_DEBUG enabled: XFS: Assertion failed: (char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)), file: fs/xfs/xfs_dir2_data.c, line: 828 Upon further digging, the tag found by xfs_dir2_data_unused_tag_p(dup) contained "2" and not the proper offset, and I found that this value was changed after the memmoves under "Use a stale leaf for our new entry." in xfs_dir2_block_addname(), i.e. memmove(&blp[mid + 1], &blp[mid], (highstale - mid) * sizeof(*blp)); overwrote it. What has happened is that the previous call to xfs_dir2_block_compact() has rearranged things; it changes btp->count as well as the blp array. So after we make that call, we must recalculate the proper pointer to the leaf entries by making another call to xfs_dir2_block_leaf_p(). Dave provided a metadump image which led to a simple reproducer (create a particular filename in the affected directory) and this resolves the testcase as well as the bug on his live system. Thanks also to dchinner for looking at this one with me. Signed-off-by: Eric Sandeen Tested-by: Dave Jones Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 7536faaa61e..12afe07a91d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -355,10 +355,12 @@ xfs_dir2_block_addname( /* * If need to compact the leaf entries, do it now. */ - if (compact) + if (compact) { xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); - else if (btp->stale) { + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { /* * Set leaf logging boundaries to impossible state. * For the no-stale case they're set explicitly. -- cgit v1.2.3 From 1bee12b8c44d825fb45cd6a13e76c185ed6888b8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 16 Jan 2013 17:33:53 -0600 Subject: xfs: Do not return EFSCORRUPTED when filesystem probe finds no XFS magic 9802182 changed the return value from EWRONGFS (aka EINVAL) to EFSCORRUPTED which doesn't seem to be handled properly by the root filesystem probe. Signed-off-by: Eric Sandeen Tested-by: Sergei Trofimovich Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff1..7d6df7c00c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, EWRONGFS); } static void -- cgit v1.2.3 From d26978dd866dbb3b3a9690f3655a5e735055de89 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 17 Jan 2013 13:11:29 -0500 Subject: xfs: pull up stack_switch check into xfs_bmapi_write The stack_switch check currently occurs in __xfs_bmapi_allocate, which means the stack switch only occurs when xfs_bmapi_allocate() is called in a loop. Pull the check up before the loop in xfs_bmapi_write() such that the first iteration of the loop has consistent behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d..cdb2d334858 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4680,9 +4680,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4956,6 +4953,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); -- cgit v1.2.3 From eb178619f930fa2ba2348de332a1ff1c66a31424 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:52 +1100 Subject: xfs: fix _xfs_buf_find oops on blocks beyond the filesystem end When _xfs_buf_find is passed an out of range address, it will fail to find a relevant struct xfs_perag and oops with a null dereference. This can happen when trying to walk a filesystem with a metadata inode that has a partially corrupted extent map (i.e. the block number returned is corrupt, but is otherwise intact) and we try to read from the corrupted block address. In this case, just fail the lookup. If it is readahead being issued, it will simply not be done, but if it is real read that fails we will get an error being reported. Ideally this case should result in an EFSCORRUPTED error being reported, but we cannot return an error through xfs_buf_read() or xfs_buf_get() so this lookup failure may result in ENOMEM or EIO errors being reported instead. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614760c..689d72655ea 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -487,6 +487,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -498,6 +499,23 @@ _xfs_buf_find( ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); -- cgit v1.2.3 From f2a459565b02b60408f3f2e5ca992a031319712b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:54 +1100 Subject: xfs: limit speculative prealloc near ENOSPC thresholds There is a window on small filesytsems where specualtive preallocation can be larger than that ENOSPC throttling thresholds, resulting in specualtive preallocation trying to reserve more space than there is space available. This causes immediate ENOSPC to be triggered, prealloc to be turned off and flushing to occur. One the next write (i.e. next 4k page), we do exactly the same thing, and so effective drive into synchronous 4k writes by triggering ENOSPC flushing on every page while in the window between the prealloc size and the ENOSPC prealloc throttle threshold. Fix this by checking to see if the prealloc size would consume all free space, and throttle it appropriately to avoid premature ENOSPC... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a6..364818eef40 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -351,6 +351,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) -- cgit v1.2.3 From 9f87832a82923943aaab38b8d53658af134bbfa4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:55 +1100 Subject: xfs: fix shutdown hang on invalid inode during create When the new inode verify in xfs_iread() fails, the create transaction is aborted and a shutdown occurs. The subsequent unmount then hangs in xfs_wait_buftarg() on a buffer that has an elevated hold count. Debug showed that it was an AGI buffer getting stuck: [ 22.576147] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 22.976213] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.376206] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.776325] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck The trace of this buffer leading up to the shutdown (trimmed for brevity) looks like: xfs_buf_init: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 1 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 1 caller xfs_trans_read_buf_map xfs_buf_iorequest: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_iorequest xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_iorequest xfs_buf_iowait: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_ioerror: bno 0x2 len 0x200 hold 1 caller xfs_buf_bio_end_io xfs_buf_iodone: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_ioend xfs_buf_iowait_done: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_trans_brelse: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_buf_item_relse: bno 0x2 nblks 0x1 hold 2 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_relse xfs_buf_unlock: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_trylock: bno 0x2 nblks 0x1 hold 2 caller _xfs_buf_find xfs_buf_find: bno 0x2 len 0x200 hold 2 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 2 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 2 caller xfs_trans_read_buf_map xfs_buf_hold: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_trans_log_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_buf_item_unlock: bno 0x2 len 0x200 hold 3 flags DIRTY liflags ABORTED xfs_buf_unlock: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock xfs_buf_rele: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock And that is the AGI buffer from cold cache read into memory to transaction abort. You can see at transaction abort the bli is dirty and only has a single reference. The item is not pinned, and it's not in the AIL. Hence the only reference to it is this transaction. The problem is that the xfs_buf_item_unlock() call is dropping the last reference to the xfs_buf_log_item attached to the buffer (which holds a reference to the buffer), but it is not freeing the xfs_buf_log_item. Hence nothing will ever release the buffer, and the unmount hangs waiting for this reference to go away. The fix is simple - xfs_buf_item_unlock needs to detect the last reference going away in this case and free the xfs_buf_log_item to release the reference it holds on the buffer. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 2 ++ fs/xfs/xfs_buf_item.c | 12 ++++++++++-- fs/xfs/xfs_trace.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 689d72655ea..fbbb9eb92e3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1505,6 +1505,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 77b09750e92..3f9949fee39 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -652,7 +652,10 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ clean = 1; for (i = 0; i < bip->bli_format_count; i++) { @@ -664,7 +667,12 @@ xfs_buf_item_unlock( } if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85a..16a812977ea 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); -- cgit v1.2.3 From 4b05d09c18d9aa62d2e7fb4b057f54e5a38963f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jan 2013 13:56:18 +0100 Subject: xfs: Fix possible use-after-free with AIO Running AIO is pinning inode in memory using file reference. Once AIO is completed using aio_complete(), file reference is put and inode can be freed from memory. So we have to be sure that calling aio_complete() is the last thing we do with the inode. CC: xfs@oss.sgi.com CC: Ben Myers CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1..5f707e53717 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); -- cgit v1.2.3 From 65e3aa77f1b0269720660a6879f6f28d158f54c8 Mon Sep 17 00:00:00 2001 From: Torsten Kaiser Date: Sun, 20 Jan 2013 10:24:49 +0100 Subject: xfs: Fix xfs_swap_extents() after removal of xfs_flushinval_pages() Commit fb59581404ab7ec5075299065c22cb211a9262a9 removed xfs_flushinval_pages() and changed its callers to use filemap_write_and_wait() and truncate_pagecache_range() directly. But in xfs_swap_extents() this change accidental switched the argument for 'tip' to 'ip'. This patch switches it back to 'tip' Signed-off-by: Torsten Kaiser Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d9..a8bd26b82ec 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,10 +246,10 @@ xfs_swap_extents( goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { -- cgit v1.2.3